import math
from dataclasses import dataclass

import torch
import torch.nn as nn
from transformers import ConvNextConfig, ConvNextModel, PreTrainedModel
from transformers.modeling_outputs import (
    BaseModelOutputWithPoolingAndNoAttention,
    ModelOutput,
)
from transformers.utils import logging

from .configuration_protonet import AudioProtoNetConfig

logger = logging.get_logger(__name__)


@dataclass
class SequenceClassifierOutputWithProtoTypeActivations(ModelOutput):
    logits: torch.Tensor = None
    loss: torch.Tensor = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: tuple[torch.FloatTensor, ...] = None
    prototype_activations: torch.FloatTensor = None


# Asymmetric Loss for Multi-Label Classification:
# https://openaccess.thecvf.com/content/ICCV2021/papers/Ridnik_Asymmetric_Loss_for_Multi-Label_Classification_ICCV_2021_paper.pdf
# Adapted from timm:
# https://github.com/huggingface/pytorch-image-models/blob/bbe798317fb26f063c18279827c038058e376479/timm/loss/asymmetric_loss.py#L6
class AsymmetricLossMultiLabel(nn.Module):
    def __init__(
        self,
        gamma_neg=4,
        gamma_pos=1,
        clip=0.05,
        eps=1e-8,
        disable_torch_grad_focal_loss=False,
        reduction="mean",
    ):
        super().__init__()
        self.gamma_neg = gamma_neg
        self.gamma_pos = gamma_pos
        self.clip = clip
        self.disable_torch_grad_focal_loss = disable_torch_grad_focal_loss
        self.eps = eps
        self.reduction = reduction

    def forward(self, x, y):
        """
        Parameters
        ----------
        x: input logits
        y: targets (multi-label binarized vector)
        """
        # Calculating probabilities
        x_sigmoid = torch.sigmoid(x)
        xs_pos = x_sigmoid
        xs_neg = 1 - x_sigmoid

        # Asymmetric clipping (probability shifting for the negative class)
        if self.clip is not None and self.clip > 0:
            xs_neg = (xs_neg + self.clip).clamp(max=1)

        # Basic binary cross-entropy terms
        los_pos = y * torch.log(xs_pos.clamp(min=self.eps))
        los_neg = (1 - y) * torch.log(xs_neg.clamp(min=self.eps))
        loss = los_pos + los_neg

        # Asymmetric focusing
        if self.gamma_neg > 0 or self.gamma_pos > 0:
            if self.disable_torch_grad_focal_loss:
                torch.set_grad_enabled(False)
            pt0 = xs_pos * y
            pt1 = xs_neg * (1 - y)  # pt = p if t > 0 else 1-p
            pt = pt0 + pt1
            one_sided_gamma = self.gamma_pos * y + self.gamma_neg * (1 - y)
            one_sided_w = torch.pow(1 - pt, one_sided_gamma)
            if self.disable_torch_grad_focal_loss:
                torch.set_grad_enabled(True)
            loss *= one_sided_w

        if self.reduction == "mean":
            return -loss.mean()
        if self.reduction == "sum":
            return -loss.sum()
        return -loss
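
# Minimal usage sketch for the loss above (shapes and values are illustrative
# assumptions, not part of the original module): logits and multi-hot targets
# share the shape (batch_size, num_classes), and easy negatives are
# down-weighted more aggressively than positives since gamma_neg > gamma_pos.
def _asymmetric_loss_usage_sketch() -> torch.Tensor:
    criterion = AsymmetricLossMultiLabel(gamma_neg=4, gamma_pos=1, clip=0.05)
    logits = torch.randn(8, 5)  # raw model outputs: 8 clips, 5 classes
    targets = torch.randint(0, 2, (8, 5)).float()  # multi-label binarized vector
    return criterion(logits, targets)  # scalar, since reduction="mean"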
""" def __init__( self, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None, ) -> None: factory_kwargs = {"device": device, "dtype": dtype} super().__init__() self.in_features = in_features self.out_features = out_features self.weight = nn.Parameter( torch.empty((out_features, in_features), **factory_kwargs) ) if bias: self.bias = nn.Parameter(torch.empty(out_features, **factory_kwargs)) else: self.register_parameter("bias", None) def forward(self, input: torch.Tensor) -> torch.Tensor: """ Defines the forward pass of the NonNegativeLinear module. Args: input (torch.Tensor): The input tensor of shape (batch_size, in_features). Returns: torch.Tensor: The output tensor of shape (batch_size, out_features). """ return nn.functional.linear(input, torch.relu(self.weight), self.bias) class LinearLayerWithoutNegativeConnections(nn.Module): r""" Custom Linear Layer where each output class is connected to a specific subset of input features. Args: in_features: size of each input sample out_features: size of each output sample bias: If set to ``False``, the layer will not learn an additive bias. Default: ``True`` device: the device of the module parameters. Default: ``None`` dtype: the data type of the module parameters. Default: ``None`` Shape: - Input: :math:`(*, H_{in})` where :math:`*` means any number of dimensions including none and :math:`H_{in} = \text{in_features}`. - Output: :math:`(*, H_{out})` where all but the last dimension are the same shape as the input and :math:`H_{out} = \text{out_features}`. Attributes: weight: the learnable weights of the module of shape :math:`(\text{out_features}, \text{features_per_output_class})`. bias: the learnable bias of the module of shape :math:`(\text{out_features})`. If :attr:`bias` is ``True``, the values are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where :math:`k = \frac{1}{\text{features_per_output_class}}` """ __constants__ = ["in_features", "out_features", "bias"] in_features: int out_features: int weight: torch.Tensor def __init__( self, in_features: int, out_features: int, bias: bool = True, non_negative: bool = True, device: torch.device = None, dtype: torch.dtype = None, ) -> None: factory_kwargs = {"device": device, "dtype": dtype} super().__init__() self.in_features = in_features self.out_features = out_features self.non_negative = non_negative # Calculate the number of features per output class self.features_per_output_class = in_features // out_features # Ensure input size is divisible by the output size assert ( in_features % out_features == 0 ), f"{in_features = } must be divisible by {out_features = }" # Define weights and biases self.weight = nn.Parameter( torch.empty( (out_features, self.features_per_output_class), **factory_kwargs ) ) if bias: self.bias = nn.Parameter(torch.empty(out_features, **factory_kwargs)) else: self.register_parameter("bias", None) # Initialize weights and biases self.reset_parameters() def reset_parameters(self) -> None: """ Initialize the weights and biases. Weights are initialized using Kaiming uniform initialization. Biases are initialized using a uniform distribution. 
""" # Kaiming uniform initialization for the weights nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) if self.bias is not None: # Calculate fan-in and fan-out values fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight) # Uniform initialization for the biases bound = 1 / math.sqrt(fan_in) nn.init.uniform_(self.bias, -bound, bound) def forward(self, input: torch.Tensor) -> torch.Tensor: """ Forward pass for the custom linear layer. Args: input (Tensor): Input tensor of shape (batch_size, in_features). Returns: Tensor: Output tensor of shape (batch_size, out_features). """ batch_size = input.size(0) # Reshape input to (batch_size, out_features, features_per_output_class) reshaped_input = input.view( batch_size, self.out_features, self.features_per_output_class ) # Apply ReLU to weights if non_negative_last_layer is True weight = torch.relu(self.weight) if self.non_negative else self.weight # Perform batch matrix multiplication and add bias output = torch.einsum("bof,of->bo", reshaped_input, weight) if self.bias is not None: output += self.bias return output def extra_repr(self) -> str: return f"in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}" class AudioProtoNetClassificationHead(nn.Module): def __init__( self, config: AudioProtoNetConfig, ) -> None: """ PPNet is a class that implements the Prototypical Part Network (ProtoPNet) for prototype-based classification. """ super().__init__() self.prototypes_per_class = config.prototypes_per_class self.num_classes = config.num_classes self.num_prototypes = self.prototypes_per_class * self.num_classes self.num_prototypes_after_pruning = config.num_prototypes_after_pruning self.margin = config.margin self.relu_on_cos = config.relu_on_cos self.incorrect_class_connection = config.incorrect_class_connection self.correct_class_connection = config.correct_class_connection self.input_vector_length = config.input_vector_length self.n_eps_channels = config.n_eps_channels self.epsilon_val = config.epsilon_val self.topk_k = config.topk_k self.bias_last_layer = config.bias_last_layer self.non_negative_last_layer = config.non_negative_last_layer self.embedded_spectrogram_height = config.embedded_spectrogram_height self.use_bias_last_layer = config.use_bias_last_layer self.prototype_class_identity = config.prototype_class_identity # Create a 1D tensor where each element represents the class index self.prototype_class_identity = ( torch.arange(self.num_prototypes) // self.prototypes_per_class ) self.prototype_shape = (self.num_prototypes, config.channels, config.height, config.width) self._setup_add_on_layers(add_on_layers_type=config.add_on_layers_type) self.prototype_vectors = nn.Parameter( torch.rand(self.prototype_shape), requires_grad=True ) self.frequency_weights = None if self.embedded_spectrogram_height is not None: # Initialize the frequency weights with a large positive value of 3.0 so that sigmoid(frequency_weights) is close to 1. 

class AudioProtoNetClassificationHead(nn.Module):
    def __init__(
        self,
        config: AudioProtoNetConfig,
    ) -> None:
        """
        PPNet head implementing the Prototypical Part Network (ProtoPNet) for
        prototype-based classification.
        """
        super().__init__()
        self.prototypes_per_class = config.prototypes_per_class
        self.num_classes = config.num_classes
        self.num_prototypes = self.prototypes_per_class * self.num_classes
        self.num_prototypes_after_pruning = config.num_prototypes_after_pruning
        self.margin = config.margin
        self.relu_on_cos = config.relu_on_cos
        self.incorrect_class_connection = config.incorrect_class_connection
        self.correct_class_connection = config.correct_class_connection
        self.input_vector_length = config.input_vector_length
        self.n_eps_channels = config.n_eps_channels
        self.epsilon_val = config.epsilon_val
        self.topk_k = config.topk_k
        self.bias_last_layer = config.bias_last_layer
        self.non_negative_last_layer = config.non_negative_last_layer
        self.embedded_spectrogram_height = config.embedded_spectrogram_height
        self.use_bias_last_layer = config.use_bias_last_layer

        # Create a 1D tensor where each element maps a prototype to its class index
        self.prototype_class_identity = (
            torch.arange(self.num_prototypes) // self.prototypes_per_class
        )

        self.prototype_shape = (
            self.num_prototypes,
            config.channels,
            config.height,
            config.width,
        )

        self._setup_add_on_layers(add_on_layers_type=config.add_on_layers_type)

        self.prototype_vectors = nn.Parameter(
            torch.rand(self.prototype_shape), requires_grad=True
        )

        self.frequency_weights = None
        if self.embedded_spectrogram_height is not None:
            # Initialize the frequency weights with a large positive value of 3.0
            # so that sigmoid(frequency_weights) starts close to 1.
            self.frequency_weights = nn.Parameter(
                torch.full(
                    (
                        self.num_prototypes,
                        self.embedded_spectrogram_height,
                    ),
                    3.0,
                )
            )

        if self.incorrect_class_connection:
            if self.non_negative_last_layer:
                self.last_layer = NonNegativeLinear(
                    self.num_prototypes, self.num_classes, bias=self.use_bias_last_layer
                )
            else:
                self.last_layer = nn.Linear(
                    self.num_prototypes, self.num_classes, bias=self.use_bias_last_layer
                )
        else:
            self.last_layer = LinearLayerWithoutNegativeConnections(
                in_features=self.num_prototypes,
                out_features=self.num_classes,
                non_negative=self.non_negative_last_layer,
            )

    def forward(
        self,
        features: torch.Tensor,
        prototypes_of_wrong_class: torch.Tensor = None,
    ) -> tuple[torch.Tensor, list[torch.Tensor]]:
        """
        Forward pass of the PPNet head.

        Args:
        - features (torch.Tensor): Backbone features with shape
          (batch_size, num_channels, height, width).
        - prototypes_of_wrong_class (Optional[torch.Tensor]): The prototypes of the
          wrong classes that are needed when using subtractive margins.
          Defaults to None.

        Returns:
            Tuple[torch.Tensor, List[torch.Tensor]]:
            - logits: A tensor containing the logits for each class in the model.
            - a list containing:
                - mean_activations: A tensor containing the mean of the top-k
                  prototype activations (in evaluation mode k is always 1).
                - marginless_logits: A tensor containing the logits for each class,
                  calculated from the marginless activations.
                - conv_features: A tensor containing the convolutional features.
                - marginless_max_activations: A tensor containing the max-pooled
                  marginless activations.
                - marginless_activations: The marginless activation maps.
        """
        features = self.add_on_layers(features)

        activations, additional_returns = self.prototype_activations(
            features, prototypes_of_wrong_class=prototypes_of_wrong_class
        )
        marginless_activations = additional_returns[0]
        conv_features = additional_returns[1]

        # Set topk_k based on the mode: use the configured value during training,
        # else 1 for evaluation
        topk_k = self.topk_k if self.training else 1

        # Reshape activations to combine spatial dimensions:
        # (batch_size, num_prototypes, height*width)
        activations = activations.view(activations.shape[0], activations.shape[1], -1)

        # Perform top-k pooling along the combined spatial dimension.
        # For topk_k=1, this is equivalent to global max pooling.
        topk_activations, _ = torch.topk(activations, topk_k, dim=-1)

        # Calculate the mean of the top-k activations per prototype:
        # (batch_size, num_prototypes). If topk_k=1, the mean is the max itself.
        mean_activations = torch.mean(topk_activations, dim=-1)

        marginless_max_activations = nn.functional.max_pool2d(
            marginless_activations,
            kernel_size=(
                marginless_activations.size()[2],
                marginless_activations.size()[3],
            ),
        )
        marginless_max_activations = marginless_max_activations.view(
            -1, self.num_prototypes
        )

        logits = self.last_layer(mean_activations)
        marginless_logits = self.last_layer(marginless_max_activations)
        return logits, [
            mean_activations,
            marginless_logits,
            conv_features,
            marginless_max_activations,
            marginless_activations,
        ]
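
    # Worked sketch of the top-k pooling step above (toy numbers, assumed for
    # illustration): averaging the k largest spatial activations per prototype
    # reduces (B, P, H, W) maps to (B, P) scores; with k=1 it is exactly global
    # max pooling.
    #
    #     >>> maps = torch.rand(2, 10, 4, 3)       # (batch, prototypes, H, W)
    #     >>> flat = maps.view(2, 10, -1)          # merge the spatial dims
    #     >>> topk, _ = torch.topk(flat, k=3, dim=-1)  # 3 strongest locations
    #     >>> scores = topk.mean(dim=-1)           # (2, 10) prototype scores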
# """ # # Extract features using the backbone model # features = self.backbone_model(x) # # # The features must be a 4D tensor of shape (batch size, channels, height, width) # if features.dim() == 3: # features.unsqueeze_(0) # # # Pass the features through additional layers # output = self.add_on_layers(features) # # return output def cos_activation( self, x: torch.Tensor, prototypes_of_wrong_class: torch.Tensor = None, ) -> tuple[torch.Tensor, torch.Tensor]: """ Compute the cosine activation between input tensor x and prototype vectors. Parameters: ----------- x : torch.Tensor Input tensor with shape (batch_size, num_channels, height, width). prototypes_of_wrong_class : Optional[torch.Tensor] Tensor containing the prototypes of the wrong class with shape (batch_size, num_prototypes). Returns: -------- Tuple[torch.Tensor, torch.Tensor] A tuple containing: - activations: The cosine activations with potential margin adjustments. - marginless_activations: The cosine activations without margin adjustments. """ input_vector_length = self.input_vector_length normalizing_factor = ( self.prototype_shape[-2] * self.prototype_shape[-1] ) ** 0.5 # Pre-allocate epsilon channels on the correct device for input tensor x epsilon_channel_x = torch.full( (x.shape[0], self.n_eps_channels, x.shape[2], x.shape[3]), self.epsilon_val, device=x.device, requires_grad=False, ) x = torch.cat((x, epsilon_channel_x), dim=-3) # Normalize x x_length = torch.sqrt(torch.sum(x**2, dim=-3, keepdim=True) + self.epsilon_val) x_normalized = (input_vector_length * x / x_length) / normalizing_factor # Pre-allocate epsilon channels for prototypes on the correct device epsilon_channel_p = torch.full( ( self.prototype_shape[0], self.n_eps_channels, self.prototype_shape[2], self.prototype_shape[3], ), self.epsilon_val, device=self.prototype_vectors.device, requires_grad=False, ) appended_protos = torch.cat((self.prototype_vectors, epsilon_channel_p), dim=-3) # Normalize prototypes prototype_vector_length = torch.sqrt( torch.sum(appended_protos**2, dim=-3, keepdim=True) + self.epsilon_val ) normalized_prototypes = appended_protos / ( prototype_vector_length + self.epsilon_val ) normalized_prototypes /= normalizing_factor # Compute activations using convolution activations_dot = nn.functional.conv2d(x_normalized, normalized_prototypes) marginless_activations = activations_dot / (input_vector_length * 1.01) if self.frequency_weights is not None: # Apply sigmoid to frequency weights. s.t. weights are between 0 and 1. 

    def prototype_activations(
        self,
        x: torch.Tensor,
        prototypes_of_wrong_class: torch.Tensor = None,
    ) -> tuple[torch.Tensor, list[torch.Tensor]]:
        """
        Compute the prototype activations for a given input tensor.

        Args:
        - x (torch.Tensor): The raw input tensor with shape
          (batch_size, num_channels, height, width).
        - prototypes_of_wrong_class (Optional[torch.Tensor]): The prototypes of the
          wrong classes that are needed when using subtractive margins.
          Defaults to None.

        Returns:
            Tuple[torch.Tensor, List[torch.Tensor]]:
            - activations: A tensor containing the prototype activations.
            - a list containing:
                - marginless_activations: A tensor containing the activations before
                  applying the subtractive margin.
                - conv_features: A tensor containing the convolutional features.
        """
        # Compute cosine activations
        activations, marginless_activations = self.cos_activation(
            x,
            prototypes_of_wrong_class=prototypes_of_wrong_class,
        )
        return activations, [marginless_activations, x]

    def get_prototype_orthogonalities(
        self, use_part_prototypes: bool = False
    ) -> torch.Tensor:
        """
        Computes the orthogonality terms, encouraging each piece of a prototype to
        be orthogonal to the others.

        This method is inspired by the paper:
        https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Interpretable_Image_Recognition_by_Constructing_Transparent_Embedding_Space_ICCV_2021_paper.pdf

        Args:
            use_part_prototypes (bool): If True, treats each spatial part of the
                prototypes as a separate prototype.

        Returns:
            torch.Tensor: A tensor representing the orthogonalities.
        """
        if use_part_prototypes:
            # Normalize prototypes to unit length
            prototype_vector_length = torch.sqrt(
                torch.sum(torch.square(self.prototype_vectors), dim=1, keepdim=True)
                + self.epsilon_val
            )
            normalized_prototypes = self.prototype_vectors / (
                prototype_vector_length + self.epsilon_val
            )

            # Calculate the total number of part prototypes per class
            num_part_prototypes_per_class = (
                self.prototypes_per_class
                * self.prototype_shape[2]
                * self.prototype_shape[3]
            )

            # Reshape to match the class structure
            normalized_prototypes = normalized_prototypes.view(
                self.num_classes,
                self.prototypes_per_class,
                self.prototype_shape[1],
                self.prototype_shape[2] * self.prototype_shape[3],
            )

            # Transpose and reshape to treat each spatial part as a separate prototype
            normalized_prototypes = normalized_prototypes.permute(0, 1, 3, 2).reshape(
                self.num_classes,
                num_part_prototypes_per_class,
                self.prototype_shape[1],
            )
        else:
            # Normalize prototypes to unit length
            prototype_vectors_reshaped = self.prototype_vectors.view(
                self.num_prototypes, -1
            )
            prototype_vector_length = torch.sqrt(
                torch.sum(torch.square(prototype_vectors_reshaped), dim=1, keepdim=True)
                + self.epsilon_val
            )
            normalized_prototypes = prototype_vectors_reshaped / (
                prototype_vector_length + self.epsilon_val
            )

            # Reshape to match the class structure
            normalized_prototypes = normalized_prototypes.view(
                self.num_classes,
                self.prototypes_per_class,
                self.prototype_shape[1]
                * self.prototype_shape[2]
                * self.prototype_shape[3],
            )

        # Compute the orthogonality matrix for each class
        orthogonalities = torch.matmul(
            normalized_prototypes, normalized_prototypes.transpose(1, 2)
        )

        # Identity matrix to enforce orthogonality
        identity_matrix = (
            torch.eye(normalized_prototypes.shape[1], device=orthogonalities.device)
            .unsqueeze(0)
            .repeat(self.num_classes, 1, 1)
        )

        # Subtract the identity to keep only the off-diagonal (cross-prototype) terms
        orthogonalities = orthogonalities - identity_matrix

        return orthogonalities
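
    # Usage sketch for the orthogonality term (hypothetical training-loop
    # snippet; `head` stands for an AudioProtoNetClassificationHead instance and
    # the weighting factor is an assumption, not from the original code):
    #
    #     >>> orthogonalities = head.get_prototype_orthogonalities()
    #     >>> ortho_loss = torch.norm(orthogonalities)  # penalize off-diagonal terms
    #     >>> total_loss = task_loss + 1e-2 * ortho_loss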
""" if use_part_prototypes: # Normalize prototypes to unit length prototype_vector_length = torch.sqrt( torch.sum(torch.square(self.prototype_vectors), dim=1, keepdim=True) + self.epsilon_val ) normalized_prototypes = self.prototype_vectors / ( prototype_vector_length + self.epsilon_val ) # Calculate total part prototypes per class num_part_prototypes_per_class = ( self.num_prototypes_per_class * self.prototype_shape[2] * self.prototype_shape[3] ) # Reshape to match class structure normalized_prototypes = normalized_prototypes.view( self.num_classes, self.num_prototypes_per_class, self.prototype_shape[1], self.prototype_shape[2] * self.prototype_shape[3], ) # Transpose and reshape to treat each spatial part as a separate prototype normalized_prototypes = normalized_prototypes.permute(0, 1, 3, 2).reshape( self.num_classes, num_part_prototypes_per_class, self.prototype_shape[1] ) else: # Normalize prototypes to unit length prototype_vectors_reshaped = self.prototype_vectors.view( self.num_prototypes, -1 ) prototype_vector_length = torch.sqrt( torch.sum(torch.square(prototype_vectors_reshaped), dim=1, keepdim=True) + self.epsilon_val ) normalized_prototypes = prototype_vectors_reshaped / ( prototype_vector_length + self.epsilon_val ) # Reshape to match class structure normalized_prototypes = normalized_prototypes.view( self.num_classes, self.num_prototypes_per_class, self.prototype_shape[1] * self.prototype_shape[2] * self.prototype_shape[3], ) # Compute orthogonality matrix for each class orthogonalities = torch.matmul( normalized_prototypes, normalized_prototypes.transpose(1, 2) ) # Identity matrix to enforce orthogonality identity_matrix = ( torch.eye(normalized_prototypes.shape[1], device=orthogonalities.device) .unsqueeze(0) .repeat(self.num_classes, 1, 1) ) # Subtract identity to focus on orthogonality orthogonalities = orthogonalities - identity_matrix return orthogonalities def identify_prototypes_to_prune(self) -> list[int]: """ Identifies the indices of prototypes that should be pruned. This function iterates through the prototypes and checks if the specific weight connecting the prototype to its class is zero. It is specifically designed to handle the LinearLayerWithoutNegativeConnections where each class has a subset of features it connects to. Returns: list[int]: A list of prototype indices that should be pruned. 
""" prototypes_to_prune = [] # Calculate the number of prototypes assigned to each class prototypes_per_class = self.num_prototypes // self.num_classes if isinstance(self.last_layer, LinearLayerWithoutNegativeConnections): # Custom layer mapping prototypes to a subset of input features for each output class for prototype_index in range(self.num_prototypes): class_index = self.prototype_class_identity[prototype_index] # Calculate the specific index within the 'features_per_output_class' for this prototype index_within_class = prototype_index % prototypes_per_class # Check if the specific weight connecting the prototype to its class is zero if self.last_layer.weight[class_index, index_within_class] == 0.0: prototypes_to_prune.append(prototype_index) else: # Standard linear layer: each prototype directly maps to a feature index weights_to_check = self.last_layer.weight for prototype_index in range(self.num_prototypes): class_index = self.prototype_class_identity[prototype_index] if weights_to_check[class_index, prototype_index] == 0.0: prototypes_to_prune.append(prototype_index) return prototypes_to_prune def prune_prototypes_by_threshold(self, threshold: float = 1e-3) -> None: """ Prune the weights in the classification layer by setting weights below a specified threshold to zero. This method modifies the weights of the last layer of the model in-place. Weights falling below the threshold are set to zero, diminishing their influence in the model's decisions. It also identifies and prunes prototypes based on these updated weights, thereby refining the model's structure. Args: threshold (float): The threshold value below which weights will be set to zero. Defaults to 1e-3. """ # Access the weights of the last layer weights = self.last_layer.weight.data # Set weights below the threshold to zero # This step reduces the influence of low-value weights in the model's decision-making process weights[weights < threshold] = 0.0 # Update the weights in the last layer to reflect the pruning self.last_layer.weight.data.copy_(weights) # Identify prototypes that need to be pruned based on the updated weights prototypes_to_prune = self.identify_prototypes_to_prune() # Execute the pruning of identified prototypes self.prune_prototypes_by_index(prototypes_to_prune) def prune_prototypes_by_index(self, prototypes_to_prune: list[int]) -> None: """ Prunes specified prototypes from the PPNet. Args: prototypes_to_prune (list[int]): A list of indices indicating the prototypes to be removed. Each index should be in the range [0, current number of prototypes - 1]. 

    def __repr__(self) -> str:
        rep = f"""PPNet(
            prototype_shape: {self.prototype_shape},
            num_classes: {self.num_classes},
            epsilon: {self.epsilon_val})"""
        return rep

    def set_last_layer_incorrect_connection(
        self, incorrect_strength: float = None
    ) -> None:
        """
        Modifies the last-layer weights to have incorrect-class connections with a
        specified strength.

        If incorrect_strength is None, initializes the weights of a
        LinearLayerWithoutNegativeConnections with the correct_class_connection
        value.

        Args:
        - incorrect_strength (Optional[float]): The strength of the incorrect
          connections. If None, initialize without incorrect connections.

        Returns:
            None
        """
        if incorrect_strength is None:
            # Handle the LinearLayerWithoutNegativeConnections initialization
            if isinstance(self.last_layer, LinearLayerWithoutNegativeConnections):
                # Initialize all weights to the correct_class_connection value
                self.last_layer.weight.data.fill_(self.correct_class_connection)
            else:
                raise ValueError(
                    "last_layer is not an instance of LinearLayerWithoutNegativeConnections"
                )
        else:
            # Create a one-hot matrix for the correct connections
            positive_one_weights_locations = torch.zeros(
                self.num_classes, self.num_prototypes
            )
            positive_one_weights_locations[
                self.prototype_class_identity,
                torch.arange(self.num_prototypes),
            ] = 1

            # Create a matrix for the incorrect connections
            negative_one_weights_locations = 1 - positive_one_weights_locations

            # Strength of the connections to the correct class
            correct_class_connection = self.correct_class_connection
            # Strength of the connections to the incorrect classes
            incorrect_class_connection = incorrect_strength

            # Modify the weights to mix correct and incorrect connections
            self.last_layer.weight.data.copy_(
                correct_class_connection * positive_one_weights_locations
                + incorrect_class_connection * negative_one_weights_locations
            )

        if self.last_layer.bias is not None:
            # Initialize all biases to the bias_last_layer value
            self.last_layer.bias.data.fill_(self.bias_last_layer)
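
    # Sketch of the connection matrix built above (toy numbers: 2 classes with
    # 2 prototypes each, correct strength 1.0, incorrect strength -0.5):
    #
    #     prototype:      p0    p1    p2    p3
    #     class 0     [  1.0,  1.0, -0.5, -0.5 ]
    #     class 1     [ -0.5, -0.5,  1.0,  1.0 ]
    #
    # i.e. each logit is rewarded by its own prototypes and penalized by the
    # prototypes of the other classes.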

    def _setup_add_on_layers(self, add_on_layers_type: str):
        """
        Configures additional layers based on the backbone model architecture and
        the specified add_on_layers_type.

        Args:
            add_on_layers_type (str): Type of additional layers to add. Can be
                'identity' or 'upsample'.
        """
        if add_on_layers_type == "identity":
            self.add_on_layers = nn.Sequential(nn.Identity())
        elif add_on_layers_type == "upsample":
            self.add_on_layers = nn.Upsample(scale_factor=2, mode="bilinear")
        else:
            raise NotImplementedError(
                f"The add-on layer type {add_on_layers_type} isn't implemented yet."
            )

    # TODO
    # def _initialize_weights(self) -> None:
    #     """
    #     Initializes the weights of the add-on layers of the network and the
    #     last layer with incorrect connections.
    #
    #     Returns:
    #         None
    #     """
    #     for m in self.add_on_layers.modules():
    #         if isinstance(m, (nn.Conv2d, nn.Linear)):
    #             nn.init.trunc_normal_(m.weight, std=0.02)
    #             if m.bias is not None:
    #                 nn.init.zeros_(m.bias)
    #
    #     # Initialize the last layer with incorrect connections using the
    #     # specified incorrect class connection strength
    #     self.set_last_layer_incorrect_connection(
    #         incorrect_strength=self.incorrect_class_connection
    #     )

class AudioProtoNetPreTrainedModel(PreTrainedModel):
    config_class = AudioProtoNetConfig
    base_model_prefix = "model"

    def _init_weights(self, module):
        if isinstance(module, (nn.Conv2d, nn.Linear)):
            nn.init.trunc_normal_(module.weight, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        if (
            isinstance(module, LinearLayerWithoutNegativeConnections)
            and self.config.incorrect_class_connection is None
        ):
            # Initialize all weights to the correct_class_connection value
            module.weight.data.fill_(self.config.correct_class_connection)


class AudioProtoNetModel(AudioProtoNetPreTrainedModel):
    _auto_class = "AutoModel"

    def __init__(self, config: AudioProtoNetConfig):
        super().__init__(config)
        backbone_config = ConvNextConfig.from_pretrained(
            "facebook/convnext-base-224-22k", num_channels=1
        )
        self.backbone = ConvNextModel(backbone_config)

    def forward(
        self, input_values: torch.Tensor, output_hidden_states: bool = None
    ) -> BaseModelOutputWithPoolingAndNoAttention:
        """
        Args:
            input_values: Input spectrograms of shape (batch_size, 1, height, width).
            output_hidden_states: Whether to return the hidden states of all stages.

        Returns:
            A BaseModelOutputWithPoolingAndNoAttention with:
            - last_hidden_state (torch.FloatTensor)
            - pooler_output (torch.FloatTensor)
            - hidden_states (Optional[Tuple[torch.FloatTensor, ...]])
        """
        return self.backbone(input_values, output_hidden_states=output_hidden_states)


class AudioProtoNetForSequenceClassification(AudioProtoNetPreTrainedModel):
    _auto_class = "AutoModelForSequenceClassification"

    def __init__(self, config: AudioProtoNetConfig):
        super().__init__(config)
        self.model = AudioProtoNetModel(config)
        self.head = AudioProtoNetClassificationHead(config)

    def forward(
        self,
        input_values: torch.Tensor,
        labels: torch.Tensor = None,
        prototypes_of_wrong_class: torch.Tensor = None,
        output_hidden_states: bool = None,
        output_prototypical_activations: bool = None,
    ) -> SequenceClassifierOutputWithProtoTypeActivations:
        backbone_outputs = self.model(
            input_values, output_hidden_states=output_hidden_states
        )
        last_hidden_state = backbone_outputs[0]
        logits, info = self.head(last_hidden_state, prototypes_of_wrong_class)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            loss_fct = AsymmetricLossMultiLabel()
            loss = loss_fct(logits, labels.float())

        hidden_states = None
        if output_hidden_states:
            hidden_states = backbone_outputs.hidden_states

        prototype_activations = None
        if output_prototypical_activations:
            prototype_activations = info[4]

        return SequenceClassifierOutputWithProtoTypeActivations(
            logits=logits,
            loss=loss,
            last_hidden_state=last_hidden_state,
            hidden_states=hidden_states,
            prototype_activations=prototype_activations,
        )
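
# End-to-end usage sketch, run via `python -m <package>.modeling_protonet` so the
# relative import resolves (hedged: the config defaults and the input shape are
# assumptions for illustration; in practice the config would come from
# AudioProtoNetConfig.from_pretrained and the inputs from a feature extractor).
if __name__ == "__main__":
    config = AudioProtoNetConfig()  # assumed defaults
    model = AudioProtoNetForSequenceClassification(config)
    model.eval()
    spectrograms = torch.randn(2, 1, 224, 224)  # (batch, channels, freq, time)
    with torch.no_grad():
        outputs = model(spectrograms, output_prototypical_activations=True)
    print(outputs.logits.shape)  # (2, config.num_classes)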