from __future__ import annotations

import os
import random
import numpy as np
import torch
import copy
from typing import List, Optional, Dict, Tuple
import cv2
from PIL import Image
import tqdm
import torch.nn as nn
import gc
import torch.nn.functional as F
from torchvision.transforms import (
    Compose,
    Resize,
    CenterCrop,
    ToTensor,
    Normalize,
    InterpolationMode,
)
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import wandb
import re
import pandas as pd
import glob


def init_repro(seed: int = 42, deterministic: bool = True) -> int:
    """Seed every RNG and enable determinism knobs for reproducible runs.

    Call this at the very top of your notebook/script BEFORE creating any
    model/processor/device context.

    Returns the seed so callers can stash it (e.g. ``self.seed = init_repro(...)``).
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = (
        ":16:8"  # deterministic cuBLAS on Ampere+, nice default
    )
    # Single-threaded BLAS to avoid thread-count-dependent reductions.
    os.environ["OMP_NUM_THREADS"] = "1"
    os.environ["MKL_NUM_THREADS"] = "1"
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # No-ops when CUDA is unavailable, safe to call unconditionally.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Determinism knobs (do this before any CUDA ops)
    if deterministic:
        try:
            torch.use_deterministic_algorithms(True)
        except Exception:
            # older torch may not support signature
            torch.set_deterministic(True)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        # Disable TF32 so matmul results are bit-stable across runs.
        torch.backends.cuda.matmul.allow_tf32 = False
        torch.backends.cudnn.allow_tf32 = False
    # Reduce threading nondeterminism
    torch.set_num_threads(1)
    return seed


def get_torch_device(prefer: Optional[str] = None) -> torch.device:
    """Pick a torch device, honoring ``prefer`` ("cuda"/"mps"/"cpu") when available.

    Falls back to cuda > mps > cpu when ``prefer`` is None or unavailable.
    """
    if prefer is not None:
        pref = prefer.lower()
        if pref == "cuda" and torch.cuda.is_available():
            return torch.device("cuda")
        if (
            pref == "mps"
            and hasattr(torch.backends, "mps")
            and torch.backends.mps.is_available()
        ):
            return torch.device("mps")
        if pref == "cpu":
            return torch.device("cpu")
    if torch.cuda.is_available():
        return torch.device("cuda")
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")


def pad_batch_sequences(
    seqs: List[torch.Tensor], device: torch.device
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Pad a list of [T_i, C] tensors into a batch [B, T_max, C] and return a
    key_padding_mask [B, T_max] with True for padded positions.

    Raises ValueError on an empty input list.
    (Annotation fixed: the mask is always returned, never None.)
    """
    if len(seqs) == 0:
        raise ValueError("pad_batch_sequences received empty sequence list")
    lengths = [int(s.shape[0]) for s in seqs]
    C = int(seqs[0].shape[1])
    T_max = int(max(lengths))
    B = len(seqs)
    batch = torch.zeros((B, T_max, C), dtype=torch.float32, device=device)
    mask = torch.ones((B, T_max), dtype=torch.bool, device=device)  # True=padded
    for i, s in enumerate(seqs):
        t = lengths[i]
        batch[i, :t, :] = s.to(device)
        mask[i, :t] = False
    return batch, mask


def compute_concept_standardization(seqs: List[torch.Tensor | np.ndarray]):
    """Per-channel mean/std over all time steps of all sequences.

    Std is clamped to 1e-6 to avoid division by zero for constant channels.
    """
    cat = torch.cat(
        [
            (
                s
                if isinstance(s, torch.Tensor)
                else torch.tensor(np.array(s), dtype=torch.float32)
            )
            for s in seqs
        ],
        dim=0,
    )
    mean = cat.mean(dim=0)
    std = cat.std(dim=0).clamp_min(1e-6)
    return mean, std


def apply_standardization(
    seqs: List[torch.Tensor | np.ndarray], mean: torch.Tensor, std: torch.Tensor
):
    """Apply (x - mean) / std per channel to each sequence; returns new tensors."""
    out = []
    for s in seqs:
        s_t = (
            s
            if isinstance(s, torch.Tensor)
            else torch.tensor(np.array(s), dtype=torch.float32)
        )
        out.append((s_t - mean) / std)
    return out


def concepts_over_time_cosine(
    concepts: torch.Tensor,
    all_data_list,
    device: torch.device = torch.device("cpu"),
    dtype: torch.dtype = torch.float32,
    chunk_size: int | None = None,
):
    """
    Cosine-sim per frame vs concepts.
    - Normalizes in fp32 for stability, computes in fp32, then returns on CPU.
    - Optional chunked matmul to cap peak memory.
""" with torch.no_grad(): # normalize concepts in fp32 on target device c = F.normalize( concepts.detach().to(device=device, dtype=torch.float32), dim=1 ) # [K,C] K = c.shape[0] activations, embeddings = [], [] for vid in all_data_list: x = vid if isinstance(vid, torch.Tensor) else torch.as_tensor(vid) if x.ndim == 1: x = x.unsqueeze(0) elif x.ndim > 2: x = x.view(-1, x.size(-1)) x = x.detach().to(device=device, dtype=torch.float32) # [T,C] if x.numel() == 0: sim = torch.empty((0, K), dtype=torch.float32, device=device) else: x = F.normalize(x, dim=1) if chunk_size is None or x.shape[0] <= chunk_size: sim = x @ c.T # [T,K] else: # chunk over T to limit peak memory outs = [] for s in range(0, x.shape[0], chunk_size): outs.append(x[s : s + chunk_size] @ c.T) sim = torch.cat(outs, dim=0) sim = torch.clamp(sim, min=0.0) # return CPU fp32 activations.append(sim.to("cpu", dtype=dtype)) embeddings.append(vid) # keep original reference if needed return activations, embeddings class PositionalEncoding(nn.Module): """ Supports both [T, C] and [B, T, C] input tensors, automatically unsqueezing and squeezing as needed for 2D input. 
""" def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 1000): super().__init__() self.dropout = nn.Dropout(p=dropout) position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze( 1 ) # [max_len,1] div_term = torch.exp( torch.arange(0, d_model, 2, dtype=torch.float32) * (-math.log(10000.0) / d_model) ) pe = torch.zeros(max_len, d_model, dtype=torch.float32) # [max_len, C] # Handle even and odd indices separately to avoid dimension mismatch pe[:, 0::2] = torch.sin(position * div_term) if d_model % 2 == 0: # Even d_model: use same div_term for cosine pe[:, 1::2] = torch.cos(position * div_term) else: # Odd d_model: need one more element for cosine div_term_cos = torch.exp( torch.arange(0, d_model - 1, 2, dtype=torch.float32) * (-math.log(10000.0) / d_model) ) pe[:, 1::2] = torch.cos(position * div_term_cos) self.register_buffer("pe", pe) def forward(self, x: torch.Tensor): """ Handles both 2D and 3D input, automatically unsqueezing and squeezing for [T, C] input. Positional encoding is broadcast over the batch dimension. 
""" squeeze_back = False if x.dim() == 2: # [T, C] -> [1, T, C] x = x.unsqueeze(0) squeeze_back = True seq_len = x.size(1) x = x + self.pe[:seq_len, :] # broadcast over batch x = self.dropout(x) if squeeze_back: x = x.squeeze(0) return x # ------------------------- # Diagonal (per-channel) Q/K/V + per-channel FFN # ------------------------- class DiagQKVd(nn.Module): """Per-channel Q/K/V with width d (no cross-concept mixing).""" def __init__(self, C: int, d: int = 8, bias: bool = True): super().__init__() self.C, self.d = C, d # groups=C keeps channels isolated; each channel gets d features self.q = nn.Conv1d(C, C * d, 1, groups=C, bias=bias) self.k = nn.Conv1d(C, C * d, 1, groups=C, bias=bias) self.v = nn.Conv1d(C, C * d, 1, groups=C, bias=bias) def forward(self, x): # x: [B,T,C] B, T, C = x.shape xc = x.transpose(1, 2) # [B,C,T] Q = self.q(xc).transpose(1, 2).view(B, T, C, self.d) # [B,T,C,d] K = self.k(xc).transpose(1, 2).view(B, T, C, self.d) V = self.v(xc).transpose(1, 2).view(B, T, C, self.d) return Q, K, V class ChannelTimeNorm(nn.Module): def __init__(self, C, eps=1e-5, affine=True): super().__init__() self.ln = nn.LayerNorm(C, eps=eps, elementwise_affine=affine) def forward(self, x): # x: [B,T,C] return self.ln(x) class PerChannelFFN(nn.Module): """Per-channel FFN (no cross-concept mixing).""" def __init__(self, C: int, dropout: float = 0.1): super().__init__() self.fc1 = nn.Conv1d( C, C, kernel_size=1, groups=C, bias=True ) # group equals C to have no channel mixing! self.fc2 = nn.Conv1d(C, C, kernel_size=1, groups=C, bias=True) self.act = nn.GELU() self.drop = nn.Dropout(dropout) def forward(self, x: torch.Tensor): # x: [B, T, C] xc = x.transpose(1, 2) # [B, C, T] y = self.fc2(self.drop(self.act(self.fc1(xc)))) return y.transpose(1, 2) # [B, T, C] class PerChannelTemporalBlock(nn.Module): """ Attention over time for each concept channel independently. Stores attn_weights: [B, C, T, T]. 
""" def __init__(self, C: int, d: int = 1, dropout: float = 0.1, T_max: int = 1024): super().__init__() self.C, self.d = C, d self.qkv = DiagQKVd(C, d) self.scale = d**-0.5 self.logit_scale = nn.Parameter(torch.zeros(C)) # per-concept multiplier self.norm1 = ChannelTimeNorm(C) self.norm2 = ChannelTimeNorm(C) self.drop = nn.Dropout(dropout) self.ffn = PerChannelFFN(C, dropout=dropout) self.act = nn.GELU() self.attn_weights = None def forward( self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None, key_padding_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: B, T, C = x.shape # Pre-attention norm y = self.norm1(x) # [B, T, C] # Per-channel QKV: Q/K/V are [B, T, C, d] Q, K, V = self.qkv(y) # Attention logits per channel: [B, C, T, T] scores = torch.einsum("btcd,bucd->bctu", Q, K) * self.scale # Optional masks if attn_mask is not None: # treat bool as additive -inf mask; float as-is if attn_mask.dtype == torch.bool: am = torch.zeros_like(attn_mask, dtype=scores.dtype) am = am.masked_fill(attn_mask, float("-inf")) else: am = attn_mask.to(dtype=scores.dtype) scores = scores + am.view(1, 1, T, T) if key_padding_mask is not None: kpm = key_padding_mask.view(B, 1, 1, T) # True = masked scores = scores.masked_fill(kpm, float("-inf")) # Softmax over source time axis w = torch.softmax(scores, dim=-1) # [B, C, T, T] self.attn_weights = w.detach() # Weighted sum of values, then reduce d out = torch.einsum("bctu,bucd->btcd", w, V).mean(dim=-1) # [B, T, C] # Residual + dropout x = x + self.drop(out) # Post-attention norm + per-channel FFN (already expects [B,T,C]) z = self.norm2(x) z = self.ffn(z) # Residual + dropout x = x + self.drop(z) return x def _pick_num_heads(C: int, proposed: Optional[int]) -> int: if proposed is not None and proposed >= 1 and C % proposed == 0: return proposed for h in [8, 6, 4, 3, 2]: if h <= C and C % h == 0: return h return 1 class FullAttentionTemporalBlock(nn.Module): """ Full multi-head self-attention over time with channel mixing 
(manual implementation). """ def __init__( self, C: int, num_heads: Optional[int] = None, dropout: float = 0.1, ffn_mult: int = 4, ): super().__init__() self.C = C self.H = _pick_num_heads(C, num_heads) self.d = C // self.H assert self.H * self.d == C, "C must be divisible by num_heads" # Projections (mix channels) self.q_proj = nn.Linear(C, C, bias=True) self.k_proj = nn.Linear(C, C, bias=True) self.v_proj = nn.Linear(C, C, bias=True) self.o_proj = nn.Linear(C, C, bias=True) self.attn_drop = nn.Dropout(dropout) self.proj_drop = nn.Dropout(dropout) self.ffn = nn.Sequential( nn.Linear(C, ffn_mult * C), nn.GELU(), nn.Dropout(dropout), nn.Linear(ffn_mult * C, C), ) self.dropout = nn.Dropout(dropout) self.norm1 = nn.LayerNorm(C) self.norm2 = nn.LayerNorm(C) self.attn_weights = None # [B, H, T, T] def _shape_heads(self, x: torch.Tensor) -> torch.Tensor: # [B, T, C] -> [B, H, T, d] B, T, _ = x.shape return x.view(B, T, self.H, self.d).permute(0, 2, 1, 3) def forward( self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None, # [T, T] key_padding_mask: Optional[torch.Tensor] = None, # [B, T] ) -> torch.Tensor: assert x.dim() == 3, "x must be [B, T, C]" B, T, C = x.shape assert C == self.C # Projections Q = self._shape_heads(self.q_proj(x)) # [B,H,T,d] K = self._shape_heads(self.k_proj(x)) # [B,H,T,d] V = self._shape_heads(self.v_proj(x)) # [B,H,T,d] # Scaled dot-product attention scale = self.d**-0.5 scores = torch.matmul(Q, K.transpose(-2, -1)) * scale # [B,H,T,T] # Masks if attn_mask is not None: # bool -> additive mask; float left as-is if attn_mask.dtype == torch.bool: am = torch.zeros_like(attn_mask, dtype=Q.dtype) # 0 keep am = am.masked_fill(attn_mask, float("-inf")) else: am = attn_mask.to(dtype=Q.dtype) scores = scores + am.view(1, 1, T, T) if key_padding_mask is not None: kpm = key_padding_mask.to(torch.bool).view( B, 1, 1, T ) # broadcast on heads & queries scores = scores.masked_fill(kpm, float("-inf")) weights = F.softmax(scores, dim=-1) # [B,H,T,T] 
weights = self.attn_drop(weights) self.attn_weights = weights.detach() out = torch.matmul(weights, V) # [B,H,T,d] out = out.permute(0, 2, 1, 3).contiguous() # [B,T,H,d] out = out.view(B, T, C) # [B,T,C] out = self.o_proj(out) out = self.proj_drop(out) # Residual + norm x = self.norm1(x + out) # FFN + residual + norm ff = self.ffn(x) x = self.norm2(x + self.dropout(ff)) return x class MoTIF: """ MoTIF model for video classification using concept bottleneck models. Assumes: - concepts_over_time_cosine returns signed cosine sims (no clamp). - self.model(window_embeddings, key_padding_mask) returns (logits, concepts, concepts_t, sharpness) """ @staticmethod def _collate_pad(batch): """ batch: list of tuples (seq:[T,C] CPU float32, y:int) Returns CPU pinned tensors to enable non_blocking .to(device) """ B = len(batch) T = max(seq.shape[0] for seq, _ in batch) C = batch[0][0].shape[1] x = torch.zeros((B, T, C), dtype=torch.float32) mask = torch.ones((B, T), dtype=torch.bool) # True = padded y = torch.empty((B,), dtype=torch.long) for i, (seq, yi) in enumerate(batch): t = seq.shape[0] x[i, :t].copy_(seq) # CPU->CPU copy into pinned mask[i, :t] = False y[i] = yi return x, mask, y def __init__(self, embedder, concepts): self.device = get_torch_device(prefer="cuda") self.concepts = concepts self.all_data = embedder.video_embeddings # dict: path -> [T,C] self.all_labels = ( embedder.labels ) # list aligned with keys order (non-SSv2 case) self.video_paths = list(self.all_data.keys()) self.video_spans = embedder.video_window_spans self.concept_bank = concepts.text_embeddings self.raw_activations, self.video_embeddings = concepts_over_time_cosine( self.concept_bank, list(self.all_data.values()) ) # list of [T,C] keep_idx = [ i for i, act in enumerate(self.raw_activations) if isinstance(act, torch.Tensor) and act.shape[0] > 0 ] if len(keep_idx) != len(self.raw_activations): removed = len(self.raw_activations) - len(keep_idx) self.raw_activations = [self.raw_activations[i] for i 
in keep_idx] self.video_paths = [self.video_paths[i] for i in keep_idx] self.all_labels = [self.all_labels[i] for i in keep_idx] # non-SSv2 path self.video_embeddings = [self.video_embeddings[i] for i in keep_idx] print(f"[MoTIF] Removed {removed} entries with empty activations.") # Stable, aligned numeric IDs (for SSv2) self.video_ids = [self.path_to_id(p) for p in self.video_paths] self.kept_ids = {vid for vid in self.video_ids if vid is not None} # Defer LabelEncoder to preprocess() self.encoder = LabelEncoder() self.class_weights = None self.mean_c, self.std_c = None, None self.X_train = self.X_val = self.X_test = None self.y_train = self.y_val = self.y_test = None self.paths_train = self.paths_val = self.paths_test = None self.test_zero_shot = None # Model attached later self.model = None @staticmethod def path_to_id(p: str): base = os.path.splitext(os.path.basename(p))[0] m = re.search(r"(\d+)", base) return int(m.group(1)) if m else None # ------------------------- # Zero-shot (vectorized over frames) # ------------------------- @torch.inference_mode() def zero_shot(self, concept_embedder, wandb_run=None): assert ( self.test_zero_shot is not None and self.y_test is not None ), "Call preprocess(...) first." 
# build text prompts and text embeddings class_prompts = ["a video of " + c for c in self.encoder.classes_.tolist()] text_embedder = copy.copy(concept_embedder) text_embedder.tokenizer = concept_embedder.tokenizer text_embedder.model = concept_embedder.model text_embedder.embedd_text(class_prompts) # keep original method name # ensure device + dtype text_embeddings = text_embedder.text_embeddings.to(self.device, dtype=torch.float32) # [K, C] text_embeddings = F.normalize(text_embeddings, dim=-1) # check model type for probability transform model_name = getattr(text_embedder, "model_name", "").lower() use_siglip = "siglip" in model_name if use_siglip: # SigLIP style scaling/bias (ensure fp32) scale = text_embedder.model.logit_scale.exp().to(self.device).float() bias = text_embedder.model.logit_bias.to(self.device).float() # shape [K] or [1,K] # counters correct_pooled = 0 correct_soft_avg = 0 correct_hard_majority = 0 for idx, frames in enumerate(self.test_zero_shot): # frames -> frame embeddings [T, C] on device frame_emb = torch.as_tensor(np.array(frames), device=self.device, dtype=torch.float32) frame_emb = F.normalize(frame_emb, dim=-1) # [T, C] # pooled embedding (mean over time) [1, C] pooled_emb = F.normalize(frame_emb.mean(dim=0, keepdim=True), dim=-1) # [1, C] # raw logits if use_siglip: logits_pooled = pooled_emb @ text_embeddings.T logits_pooled = logits_pooled * scale + bias # [1, K] logits_per_frame = (frame_emb @ text_embeddings.T) * scale + bias # [T, K] probs_per_frame = logits_per_frame.sigmoid() # for soft average else: logits_pooled = pooled_emb @ text_embeddings.T # [1, K] logits_per_frame = frame_emb @ text_embeddings.T # [T, K] probs_per_frame = logits_per_frame.softmax(dim=-1) # for soft average # predictions pred_pooled = logits_pooled.argmax(dim=-1).item() # mean-pooled embedding pred_soft_avg = probs_per_frame.mean(dim=0).argmax().item() # soft voting (avg probs) per_frame_preds = logits_per_frame.argmax(dim=-1) # [T] counts = 
                torch.bincount(per_frame_preds, minlength=logits_per_frame.size(1))
            pred_hard_majority = counts.argmax().item()  # hard majority (mode)
            # ground truth
            y = int(self.y_test[idx])
            # update counters
            correct_pooled += int(pred_pooled == y)
            correct_soft_avg += int(pred_soft_avg == y)
            correct_hard_majority += int(pred_hard_majority == y)
        n = max(1, len(self.test_zero_shot))
        acc_pooled = correct_pooled / n
        acc_soft_avg = correct_soft_avg / n
        acc_hard_majority = correct_hard_majority / n
        # logging
        if wandb_run is not None:
            wandb_run.log(
                {
                    "zero_shot_acc_pooled": acc_pooled,
                    "zero_shot_acc_soft_avg": acc_soft_avg,
                    "zero_shot_acc_hard_majority": acc_hard_majority,
                }
            )
        print(
            f"[ZS] pooled={acc_pooled:.4f} | soft-avg={acc_soft_avg:.4f} | hard-majority={acc_hard_majority:.4f}"
        )
        return {
            "acc_pooled": acc_pooled,
            "acc_soft_avg": acc_soft_avg,
            "acc_hard_majority": acc_hard_majority,
        }

    # -------------------------
    # Preprocess (unchanged split logic; at end we build datasets)
    # -------------------------
    def preprocess(
        self,
        dataset: str,
        info: Optional[str] = None,
        test_size: float = 0.2,
        random_state: int = 42,
    ):
        """Build train/val/test splits, fit the label encoder, standardize
        activations, and compute class weights.

        dataset: "breakfast" | "ucf101" | "hmdb51" | "something2"; info selects
        the official split (e.g. "s1"). With info=None a stratified random
        split is used instead.
        """
        # binary_array[i] == True  -> sample i goes to train
        # binary_array[i] == False -> sample i goes to test
        binary_array = []

        def get_index(info):
            # Map split name ("s1".."s3") to the official split index; default 1.
            if info == "s1":
                index = 1
            elif info == "s2":
                index = 2
            elif info == "s3":
                index = 3
            else:
                index = 1
            return index

        if info:
            if dataset == "breakfast":
                # Participant-ID ranges per Breakfast split group.
                RANGES = {
                    "s1": range(3, 16),
                    "s2": range(16, 29),
                    "s3": range(29, 42),
                    "s4": range(42, 54),
                }

                def split_paths_by_group(paths, group_name, ranges=RANGES):
                    if group_name not in ranges:
                        raise ValueError(
                            f"Unknown group '{group_name}'. Expected one of {list(ranges)}"
                        )
                    target = ranges[group_name]
                    for p in paths:
                        # Paths matching the held-out participant IDs go to test.
                        if any(re.search(rf"P{num:02}", p) for num in target):
                            binary_array.append(False)
                        else:
                            binary_array.append(True)
                    return binary_array

                binary_array = split_paths_by_group(self.video_paths, info)
            elif dataset == "ucf101":
                index = get_index(info)
                ucf_test_list = (
                    f"../Datasets/UCF101/ucfTrainTestlist/testlist0{index}.txt"
                )
                path_list = pd.read_csv(ucf_test_list, sep=" ", header=None)
                for path in self.video_paths:
                    # Official lists reference .avi names relative to the class dir.
                    path_rel = path.split("Video_data/")[1].replace(".mp4", ".avi")
                    binary_array.append(
                        False if path_rel in path_list[0].values else True
                    )
            elif dataset == "hmdb51":
                index = get_index(info)
                labels_path = "../Datasets/HMDB/testTrainMulti_7030_splits/"
                path_text_dirs = glob.glob(os.path.join(labels_path, "*.txt"))
                path_text_dirs_idx = [p for p in path_text_dirs if f"split{index}" in p]
                path_text_dirs_idx.sort()
                # Official HMDB flags: 1 = train, 2 = test, 0 = unused.
                path_list_test, path_list_train, path_list_ignore = set(), set(), set()
                for txt_path in path_text_dirs_idx:
                    with open(txt_path, "r") as fh:
                        for line in fh:
                            name, flag = line.strip().split()
                            if flag == "2":
                                path_list_test.add(name)
                            elif flag == "0":
                                path_list_ignore.add(name)
                            else:
                                path_list_train.add(name)
                # mask: True=train, False=test, None=drop (unused or unknown).
                mask = []
                for vp in self.video_paths:
                    basename = os.path.basename(vp).replace(".mp4", ".avi")
                    if basename in path_list_test:
                        mask.append(False)
                    elif basename in path_list_train:
                        mask.append(True)
                    elif basename in path_list_ignore:
                        mask.append(None)
                    else:
                        mask.append(None)
                kept = [
                    (x, y, p, b, m)
                    for x, y, p, b, m in zip(
                        self.raw_activations,
                        self.all_labels,
                        self.video_paths,
                        self.video_embeddings,
                        mask,
                    )
                    if m is not None
                ]
                if not kept:
                    raise ValueError(
                        "HMDB split produced no usable items. Check paths and split lists."
) ( self.raw_activations, self.all_labels, self.video_paths, self.video_embeddings, mask_kept, ) = map(list, zip(*kept)) self.video_ids = [ ( int(os.path.splitext(os.path.basename(p))[0]) if os.path.splitext(os.path.basename(p))[0].isdigit() else None ) for p in self.video_paths ] self.kept_ids = {vid for vid in self.video_ids if vid is not None} binary_array = [True if m else False for m in mask_kept] elif dataset == "something2": # ===== SSv2 handling ===== def replace_something(text: str) -> str: return re.sub(r"\[(.*?)\]", r"\1", text) val_json = "../Datasets/Something2/labels/validation.json" train_json = "../Datasets/Something2/labels/train.json" test_json = "../Datasets/Something2/labels/test.json" test_csv = "../Datasets/Something2/labels/test-answers.csv" df_train = pd.read_json(train_json) df_val = pd.read_json(val_json) df_test = pd.read_json(test_json) train_ids = [int(row[0]) for row in df_train.values.tolist()] val_ids = [int(row[0]) for row in df_val.values.tolist()] test_ids = [int(row[0]) for row in df_test.values.tolist()] train_labels = [replace_something(t) for t in df_train["template"]] val_labels = [replace_something(t) for t in df_val["template"]] test_tbl = pd.read_csv( test_csv, sep=";", header=None, dtype={0: int, 1: str} ) test_labels_map = dict(zip(test_tbl[0].tolist(), test_tbl[1].tolist())) test_labels = [test_labels_map[i] for i in test_ids] id2split = {} id2split.update( {i: ("train", l) for i, l in zip(train_ids, train_labels)} ) id2split.update({i: ("val", l) for i, l in zip(val_ids, val_labels)}) id2split.update({i: ("test", l) for i, l in zip(test_ids, test_labels)}) train_x, val_x, test_x = [], [], [] train_y, val_y, test_y = [], [], [] self.test_zero_shot = [] self.paths_train, self.paths_val, self.paths_test = [], [], [] self.video_ids = [self.path_to_id(p) for p in self.video_paths] missed = 0 for idx, vid in enumerate(self.video_ids): if vid is None: missed += 1 continue entry = id2split.get(vid) if entry is None: missed += 
1 continue split, lab = entry if split == "train": train_x.append(self.raw_activations[idx]) train_y.append(lab) self.paths_train.append(self.video_paths[idx]) elif split == "val": val_x.append(self.raw_activations[idx]) val_y.append(lab) self.paths_val.append(self.video_paths[idx]) elif split == "test": test_x.append(self.raw_activations[idx]) test_y.append(lab) self.paths_test.append(self.video_paths[idx]) self.test_zero_shot.append(self.video_embeddings[idx]) if missed: print( f"[SSv2] Skipped {missed} items (no parseable ID or not in official splits)." ) if len(train_x) == 0: raise RuntimeError( "[SSv2] No training samples matched. Check filename-to-ID parsing and dataset paths." ) self.encoder = self.encoder.fit(train_y) self.X_train, self.y_train = train_x, self.encoder.transform( np.array(train_y, dtype=object) ) self.X_val, self.y_val = val_x, ( self.encoder.transform(np.array(val_y, dtype=object)) if len(val_x) else (None, None) ) self.X_test, self.y_test = test_x, ( self.encoder.transform(np.array(test_y, dtype=object)) if len(test_x) else (None, None) ) # ===== end SSv2 ===== if dataset != "something2": self.X_train = [ self.raw_activations[i] for i in range(len(self.raw_activations)) if binary_array[i] ] self.X_test = [ self.raw_activations[i] for i in range(len(self.raw_activations)) if not binary_array[i] ] self.y_train = [ self.all_labels[i] for i in range(len(self.all_labels)) if binary_array[i] ] self.y_test = [ self.all_labels[i] for i in range(len(self.all_labels)) if not binary_array[i] ] self.paths_train = [ self.video_paths[i] for i in range(len(self.video_paths)) if binary_array[i] ] self.paths_test = [ self.video_paths[i] for i in range(len(self.video_paths)) if not binary_array[i] ] self.encoder = self.encoder.fit(self.y_train) self.y_train = self.encoder.transform(self.y_train) self.y_test = self.encoder.transform(self.y_test) self.test_zero_shot = [ self.video_embeddings[i] for i in range(len(self.video_embeddings)) if not binary_array[i] 
] else: # Stratified random split (non-SSv2) ( self.X_train, self.X_test, self.y_train, self.y_test, self.paths_train, self.paths_test, ) = train_test_split( self.raw_activations, self.all_labels, self.video_paths, test_size=test_size, random_state=random_state, stratify=self.all_labels, ) self.encoder = self.encoder.fit(self.y_train) self.y_train = self.encoder.transform(self.y_train) self.y_test = self.encoder.transform(self.y_test) # ----- Standardization ----- self.mean_c, self.std_c = compute_concept_standardization(self.X_train) self.X_train = apply_standardization(self.X_train, self.mean_c, self.std_c) self.X_test = apply_standardization(self.X_test, self.mean_c, self.std_c) if self.X_val is not None: self.X_val = apply_standardization(self.X_val, self.mean_c, self.std_c) # ----- Class weights ----- classes, counts = np.unique(self.y_train, return_counts=True) self.class_weights = torch.tensor(counts.max() / counts, dtype=torch.float32) self.num_concepts = self.X_train[0].shape[-1] self.num_classes = len(classes) def train_model( self, num_epochs: int, l1_lambda: float, lambda_sparse: float, batch_size: int = 8, lr: float = 1e-4, weight_decay: float = 1e-2, enforce_nonneg: bool = True, class_weights: bool = True, wandb_run: Optional[wandb.WandbRun] = None, random_seed: int = 42, ckpt_path: Optional[str] = None, early_stopping_patience: int = 50, ): if wandb_run is not None: wandb_run.config.update( { "num_epochs": num_epochs, "l1_lambda": l1_lambda, "lambda_sparse": lambda_sparse, "lr": lr, "weight_decay": weight_decay, "batch_size": batch_size, "enforce_nonneg": enforce_nonneg, "class_weights": class_weights, "transformer_layers": self.model.transformer_layers, "lse_tau": self.model.lse_tau, "diagonal_attention": self.model.diagonal_attention, "early_stopping_patience": early_stopping_patience, } ) # move model to device self.model.to(self.device) optimizer = torch.optim.AdamW( self.model.parameters(), lr=lr, weight_decay=weight_decay ) if class_weights: 
criterion = nn.CrossEntropyLoss( weight=self.class_weights.to(self.device), label_smoothing=0.1 ) else: criterion = nn.CrossEntropyLoss(label_smoothing=0.1) num_train = len(self.X_train) best_metric = -float("inf") best_state = None best_epoch = -1 epochs_since_improvement = 0 use_early_stopping = (early_stopping_patience is not None) and ( len(self.X_test) > 0 ) for epoch in range(num_epochs): self.model.train() correct, total = 0, 0 last_loss, last_L_sparse = None, None epoch_L_sparse_sum, epoch_batches = 0.0, 0 base_seed = int(getattr(self, "seed", random_seed)) g = torch.Generator(device="cpu").manual_seed(base_seed + epoch) perm_tensor = torch.randperm(num_train, generator=g) perm = perm_tensor.tolist() for start in range(0, num_train, batch_size): end = min(start + batch_size, num_train) idx = perm[start:end] batch_seqs = [self.X_train[i] for i in idx] batch_labels = torch.tensor( [int(self.y_train[i]) for i in idx], dtype=torch.long, device=self.device, ) inputs, pad_mask = pad_batch_sequences(batch_seqs, device=self.device) optimizer.zero_grad() # updated forward: now returns sharpness logits, concepts_, concepts_t, sharpness = self.model( inputs, key_padding_mask=pad_mask ) valid = (~pad_mask).unsqueeze(-1).float() last_L_sparse = (concepts_t.abs() * valid).sum() / ( valid.sum() * concepts_t.shape[-1] ).clamp(min=1.0) ce = criterion(logits, batch_labels) l1 = l1_lambda * self.model.classifier.weight.abs().sum() loss = ce + l1 + lambda_sparse * last_L_sparse loss.backward() optimizer.step() last_loss = loss # accumulate for epoch-average L_sparse epoch_L_sparse_sum += float(last_L_sparse.detach().item()) epoch_batches += 1 if enforce_nonneg: with torch.no_grad(): self.model.classifier.weight.clamp_(min=0.0) preds = logits.argmax(dim=1) correct += int((preds == batch_labels).sum().item()) total += batch_labels.shape[0] acc = correct / max(1, total) epoch_L_sparse = epoch_L_sparse_sum / max(1, epoch_batches) # ===== evaluation ===== def evaluate(dataset_X, 
dataset_y): self.model.eval() correct, total = 0, 0 sharpness_vals = [] with torch.no_grad(): for start in range(0, len(dataset_X), batch_size): end = min(start + batch_size, len(dataset_X)) batch_seqs = [dataset_X[i] for i in range(start, end)] batch_labels = torch.tensor( [int(dataset_y[i]) for i in range(start, end)], dtype=torch.long, device=self.device, ) inputs, pad_mask = pad_batch_sequences( batch_seqs, device=self.device ) logits, _, _, sharpness = self.model( inputs, key_padding_mask=pad_mask ) preds = logits.argmax(dim=1) correct += int((preds == batch_labels).sum().item()) total += batch_labels.shape[0] for b in range(logits.shape[0]): sharpness_vals.append( { "concepts_max": float( sharpness["concepts"]["max"][b] .mean() .detach() .cpu() .item() ), "concepts_entropy": float( sharpness["concepts"]["entropy"][b] .mean() .detach() .cpu() .item() ), "logits_max": float( sharpness["logits"]["max"][b] .mean() .detach() .cpu() .item() ), "logits_entropy": float( sharpness["logits"]["entropy"][b] .mean() .detach() .cpu() .item() ), } ) acc = correct / max(1, total) if sharpness_vals: mean_sharp = { k: float(np.mean([s[k] for s in sharpness_vals])) for k in sharpness_vals[0] } else: mean_sharp = {} return acc, mean_sharp test_acc, test_sharp = ( (0.0, {}) if len(self.X_test) == 0 else evaluate(self.X_test, self.y_test) ) val_acc, val_sharp = ( (0.0, {}) if self.X_val is None else evaluate(self.X_val, self.y_val) ) metric = test_acc if len(self.X_test) > 0 else acc # ===== checkpointing ===== if metric > best_metric + 1e-8: best_metric = metric best_epoch = epoch epochs_since_improvement = 0 best_state = { k: v.detach().cpu().clone() for k, v in self.model.state_dict().items() } if ckpt_path: tmp = ckpt_path + ".tmp" torch.save(best_state, tmp) os.replace(tmp, ckpt_path) else: epochs_since_improvement += 1 # ===== wandb logging ===== if wandb_run is not None: current_lr = ( optimizer.param_groups[0]["lr"] if optimizer.param_groups else None ) log_data = { 
"epoch": epoch + 1, "train_loss": ( float(last_loss.item()) if last_loss is not None else None ), "train_acc": acc, "test_acc": test_acc, "val_acc": val_acc if self.X_val is not None else None, "L_sparse": ( float(last_L_sparse.item()) if last_L_sparse is not None else None ), "learning_rate": current_lr, "best_val_acc": best_metric, "epochs_since_improvement": epochs_since_improvement, } # add sharpness metrics for prefix, sharp in [("test_", test_sharp), ("val_", val_sharp)]: for k, v in sharp.items(): log_data[prefix + "sharp_" + k] = v wandb_run.log(log_data) if epoch % 10 == 0 or epoch == num_epochs - 1: msg_loss = ( float(last_loss.item()) if last_loss is not None else float("nan") ) msg_sparse = ( float(last_L_sparse.item()) if last_L_sparse is not None else float("nan") ) print( f"Epoch {epoch+1}/{num_epochs} | loss {msg_loss:.4f} | test_acc {test_acc:.4f} " f"| train_acc {acc:.4f} | L_sparse {msg_sparse:.4f} " f"| best_val {best_metric:.4f} | epochs_no_improve {epochs_since_improvement}" ) # early stopping if ( use_early_stopping and epochs_since_improvement >= early_stopping_patience ): print( f"[MoTIF] Early stopping triggered (no improvement for {epochs_since_improvement} epochs). Stopping at epoch {epoch+1}." ) if wandb_run is not None: wandb_run.log( { "early_stopped_epoch": epoch + 1, "early_stopping_patience": early_stopping_patience, } ) break # ===== restore best ===== if best_state is not None: self.model.load_state_dict(best_state, strict=True) self.model.eval() print( f"[MoTIF] Restored best weights from epoch {best_epoch+1} (metric={best_metric:.4f})." 
            )
        else:
            print("[MoTIF] No best_state captured (empty training?).")


# -------------------------
# PerConceptAffine + CBMTransformer using the per-channel temporal block
# -------------------------
class PerConceptAffine(nn.Module):
    """Per-concept affine rescale + shifted softplus, clamped to be non-negative.

    Each of the ``num_concepts`` channels has its own learnable ``scale`` and
    ``bias``. The softplus output is shifted by ``log(2)`` so that a zero
    pre-activation maps exactly to zero (``softplus(0) == log(2)``).
    """

    def __init__(self, num_concepts: int):
        super().__init__()
        # One scale/bias pair per concept channel.
        self.scale = nn.Parameter(torch.ones(num_concepts))
        self.bias = nn.Parameter(torch.zeros(num_concepts))

    def forward(self, x: torch.Tensor):
        ## Comment out to test no scaling and bias ablation for paper
        # softplus(x*scale + bias) - log(2): zero pre-activation -> zero output.
        y = F.softplus(x * self.scale + self.bias) - math.log(2.0)
        # The shift makes small negative values possible; clamp back to >= 0.
        return y.clamp(min=0.0)


class CBMTransformer(nn.Module):
    """Concept-bottleneck transformer over per-frame concept sequences.

    Pipeline: positional encoding -> stack of temporal blocks (per-channel
    "diagonal" attention or full attention) -> LayerNorm -> per-concept affine
    head -> linear (optionally non-negative) classifier, with log-sum-exp
    pooling over time for both concepts and logits.
    """

    def __init__(
        self,
        num_concepts: int,
        num_classes: int,
        transformer_layers: int = 1,
        dropout: float = 0.1,
        lse_tau: float = 1.0,
        nonneg_classifier: bool = False,
        diagonal_attention: bool = True,
        dimension=1,
    ):
        super().__init__()
        self.lse_tau = lse_tau
        self.diagonal_attention = diagonal_attention
        self.transformer_layers = transformer_layers
        # d_model equals the number of concepts: tokens are concept vectors.
        self.posenc = PositionalEncoding(
            d_model=num_concepts, dropout=dropout, max_len=2000
        )
        if diagonal_attention:
            # Each concept channel attends over time independently.
            self.layers = nn.ModuleList(
                [
                    PerChannelTemporalBlock(
                        C=num_concepts, dropout=dropout, d=dimension
                    )
                    for _ in range(transformer_layers)
                ]
            )
        else:
            # Standard attention mixing all channels.
            self.layers = nn.ModuleList(
                [
                    FullAttentionTemporalBlock(
                        C=num_concepts, num_heads=None, dropout=dropout
                    )
                    for _ in range(transformer_layers)
                ]
            )
        self.norm = nn.LayerNorm(num_concepts)
        self.concept_predictor = PerConceptAffine(num_concepts)
        if nonneg_classifier:
            self.classifier = NonNegativeLinear(num_concepts, num_classes)
        else:
            self.classifier = nn.Linear(num_concepts, num_classes)
        # for introspection
        self.last_time_importance = None  # [B,T] detached

    def forward(
        self,
        window_embeddings: torch.Tensor,
        key_padding_mask: Optional[torch.Tensor] = None,
        channel_ids: Optional[Union[List[int], torch.Tensor]] = None,
        window_ids: Optional[Union[List[int], torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        window_embeddings: [B,T,C] or [T,C]
        key_padding_mask: [B,T] with True for padded tokens to be ignored
        channel_ids / window_ids: optional concept-intervention indices; the
            selected concept activations are zeroed before classification.
        Returns:
            logits: [B,K] pooled class logits
            concepts: [B,C] pooled concept activations
            concepts_t: [B,T,C] per-time-step concepts
            sharpness: dict with 'concepts' and 'logits' sharpness per batch
        """
        x = window_embeddings
        # Accept a single unbatched sequence and promote it to batch size 1.
        if x.dim() == 2:
            x = x.unsqueeze(0)  # [1,T,C]
        if key_padding_mask is not None and key_padding_mask.dim() == 1:
            key_padding_mask = key_padding_mask.unsqueeze(0)

        # --- transformer backbone ---
        x = self.posenc(x)  # [B,T,C]
        for layer in self.layers:
            x = layer(x, key_padding_mask=key_padding_mask)
        x = self.norm(x)  # [B,T,C]

        # --- concept predictions per time step ---
        concepts_t = self.concept_predictor(x)  # [B,T,C]

        # --- concept interventions ---
        # Zero out the intervened (time, channel) activations in place.
        if channel_ids is not None and window_ids is not None:
            concepts_t[:, window_ids, channel_ids] = 0
        elif channel_ids is not None:
            concepts_t[:, :, channel_ids] = 0
        elif window_ids is not None:
            concepts_t[:, window_ids, :] = 0

        logits_t = self.classifier(concepts_t)  # [B,T,K]
        tau = self.lse_tau

        # --- LSE pooling over time ---
        # Padded positions are set to -inf so they contribute nothing to LSE;
        # tau interpolates between mean-like (small tau) and max (large tau).
        if key_padding_mask is not None:
            concepts_t_masked = concepts_t.masked_fill(
                key_padding_mask.unsqueeze(-1), float("-inf")
            )
            logits_t_masked = logits_t.masked_fill(
                key_padding_mask.unsqueeze(-1), float("-inf")
            )
            concepts = (concepts_t_masked * tau).logsumexp(dim=1) / tau  # [B,C]
            logits = (logits_t_masked * tau).logsumexp(dim=1) / tau  # [B,K]
        else:
            concepts = (concepts_t * tau).logsumexp(dim=1) / tau
            logits = (logits_t * tau).logsumexp(dim=1) / tau

        # --- temporal importance for explanation ---
        with torch.no_grad():
            pred = logits.argmax(dim=1)  # [B]
            # NOTE(review): the gather index is [B,1,1]; torch.gather allows
            # index.size(d) <= input.size(d) for d != dim, so this yields a
            # [B,1] result, not the annotated [B,T]. If a per-time profile is
            # intended, the index likely needs .expand(-1, T, -1) — confirm.
            sel = torch.gather(logits_t, dim=2, index=pred[:, None, None]).squeeze(
                -1
            )  # [B,T]
            if key_padding_mask is not None:
                sel = sel.masked_fill(key_padding_mask, float("-inf"))
            self.last_time_importance = torch.softmax(
                sel / tau, dim=1
            ).detach()  # softmax importance

        # --- compute sharpness of LSE pooled distributions ---
        def compute_sharpness(x_t, mask=None):
            """Compute max / entropy as
            sharpness metric for batch"""
            if mask is not None:
                x_t = x_t.masked_fill(mask.unsqueeze(-1), float("-inf"))
            # Softmax over the time dimension (dim=1).
            probs = torch.softmax(tau * x_t, dim=1)
            probs = probs.clamp(min=1e-8)  # avoids log(0)
            max_prob = probs.max(dim=1).values  # [B]
            entropy = -(probs * probs.log()).sum(dim=1)
            return {"max": max_prob, "entropy": entropy}

        sharpness = {
            "concepts": compute_sharpness(concepts_t, key_padding_mask),
            "logits": compute_sharpness(logits_t, key_padding_mask),
        }
        return logits, concepts, concepts_t, sharpness

    def get_attention_maps(self):
        # list of [B, C, T, T] (detached)
        # Layers that have not recorded weights yet contribute None.
        return [
            layer.attn_weights.cpu() if layer.attn_weights is not None else None
            for layer in self.layers
        ]


def mean_cbm(model, wandb_run=None):
    """Baseline: collapse each sequence to a single vector (temporal mean, or a
    random frame when the debug flag is set) and fit a linear probe on it.

    Reads X_train/X_test, y_train/y_test, num_classes, num_concepts (and
    optionally device) from ``model``.
    """
    X_train, X_test = model.X_train.copy(), model.X_test.copy()
    y_train, y_test = model.y_train.copy(), model.y_test.copy()
    num_classes = model.num_classes
    num_concepts = model.num_concepts
    batch_size = 1
    device = getattr(model, "device", get_torch_device())

    # NOTE(review): this local shadows the stdlib `random` module within this
    # function — harmless here, but rename if `random.*` is ever needed below.
    random = False  # was for testing
    if random:

        def get_random_image(x):
            # Pick one frame uniformly at random from the sequence.
            idx = np.random.randint(0, len(x))
            return x[idx]

        # Replace each video with a random frame (as np array)
        X_train_random = [get_random_image(x) for x in X_train]
        X_test_random = [get_random_image(x) for x in X_test]
        X_train_mean = X_train_random
        X_test_mean = X_test_random
    else:
        # take mean
        X_train_mean = [torch.mean(x, axis=0) for x in X_train]  # [T,C] -> [C]
        X_test_mean = [torch.mean(x, axis=0) for x in X_test]  # [T,C] -> [C]

    # Stack into arrays before converting to torch tensors
    X_train_arr = np.stack(
        [
            t.cpu().numpy() if isinstance(t, torch.Tensor) else np.array(t)
            for t in X_train_mean
        ]
    )
    X_test_arr = np.stack(
        [
            t.cpu().numpy() if isinstance(t, torch.Tensor) else np.array(t)
            for t in X_test_mean
        ]
    )
    tensor_train = torch.tensor(X_train_arr, dtype=torch.float32, device=device)
    tensor_test = torch.tensor(X_test_arr, dtype=torch.float32, device=device)

    # train a linear model on the random/mean frames
    linear_model = nn.Linear(num_concepts,
num_classes).to(device) criterion = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(linear_model.parameters(), lr=0.001) num_epochs = 200 for epoch in range(num_epochs): linear_model.train() optimizer.zero_grad() outputs = linear_model(tensor_train) loss = criterion( outputs, torch.tensor(y_train, dtype=torch.long, device=device) ) loss.backward() optimizer.step() if wandb_run is not None: with torch.no_grad(): preds = outputs.argmax(dim=1) acc = (preds.detach().cpu().numpy() == y_train).mean() current_lr = ( optimizer.param_groups[0]["lr"] if optimizer.param_groups else None ) wandb_run.log( { "mean_train_loss": loss.item(), "mean_train_acc": acc, "mean_learning_rate": current_lr, } ) linear_model.eval() with torch.no_grad(): outputs = linear_model(tensor_test) _, predicted = torch.max(outputs, 1) accuracy = (predicted.detach().cpu().numpy() == y_test).mean() print(f"CBM accuracy test: {accuracy:.4f}") if wandb_run is not None: wandb_run.log({"mean_test_acc": accuracy}) class NonNegativeLinear: def __init__(self, in_features, out_features, bias=True): self.linear = nn.Linear(in_features, out_features, bias=bias) def forward(self, x): self.linear.weight.data.clamp_(min=0.0) return self.linear(x)