# --------------------------------------------------------
# InternVL with Audio Support
# Audio Configuration
# --------------------------------------------------------
import copy
import os
from typing import Union

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)


class AudioConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of an Audio Encoder Model.
    It is used to instantiate an audio encoder according to the specified arguments,
    defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control
    the model outputs. Read the documentation from [`PretrainedConfig`] for more information.

    Args:
        speech_encoder (`str`, *optional*, defaults to `"whisper-base"`):
            Path or name of the speech encoder model.
        speech_encoder_type (`str`, *optional*, defaults to `"whisper"`):
            Type of speech encoder to use.
        speech_projector_type (`str`, *optional*, defaults to `"linear"`):
            Type of speech projector to use for feature alignment.
        speech_encoder_ds_rate (`int`, *optional*, defaults to 5):
            Downsampling rate for speech features.
        speech_encoder_hidden_size (`int`, *optional*, defaults to 1280):
            Hidden size of the speech encoder.
        mel_bins (`int`, *optional*, defaults to 80):
            Number of mel-frequency bins for spectrogram features.
        sample_rate (`int`, *optional*, defaults to 16000):
            Audio sample rate in Hz.
        frame_length (`float`, *optional*, defaults to 25.0):
            Frame length in milliseconds for audio processing.
        frame_shift (`float`, *optional*, defaults to 10.0):
            Frame shift in milliseconds for audio processing.
        use_beats (`bool`, *optional*, defaults to False):
            Whether to use BEATs model for audio feature extraction.
        beats_model_path (`str`, *optional*, defaults to None):
            Path to BEATs model if use_beats is True.
        whisper_config (`dict`, *optional*, defaults to None):
            Configuration dictionary for Whisper model parameters.
    """

    model_type = 'audio_encoder'

    def __init__(
        self,
        speech_encoder="whisper-base",
        speech_encoder_type="whisper",
        speech_projector_type="linear",
        speech_encoder_ds_rate=5,
        speech_encoder_hidden_size=1280,
        mel_bins=80,
        sample_rate=16000,
        frame_length=25.0,
        frame_shift=10.0,
        use_beats=False,
        beats_model_path=None,
        whisper_config=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.speech_encoder = speech_encoder
        self.speech_encoder_type = speech_encoder_type
        self.speech_projector_type = speech_projector_type
        self.speech_encoder_ds_rate = speech_encoder_ds_rate
        self.speech_encoder_hidden_size = speech_encoder_hidden_size
        self.mel_bins = mel_bins
        self.sample_rate = sample_rate
        self.frame_length = frame_length
        self.frame_shift = frame_shift
        self.use_beats = use_beats
        self.beats_model_path = beats_model_path
        # Normalize None to an empty dict so downstream code can index safely.
        self.whisper_config = whisper_config or {}

        # Lazy %-style args: the message is only formatted if INFO is enabled.
        logger.info('Audio Config - Speech Encoder: %s', self.speech_encoder)
        logger.info('Audio Config - Encoder Type: %s', self.speech_encoder_type)
        logger.info('Audio Config - Projector Type: %s', self.speech_projector_type)
        logger.info('Audio Config - Downsampling Rate: %s', self.speech_encoder_ds_rate)
        logger.info('Audio Config - Hidden Size: %s', self.speech_encoder_hidden_size)
        logger.info('Audio Config - Mel Bins: %s', self.mel_bins)
        logger.info('Audio Config - Sample Rate: %s', self.sample_rate)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
        """Build an `AudioConfig` from a pretrained checkpoint directory or hub id.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Location of the saved configuration (local path or model identifier).

        Returns:
            `PretrainedConfig`: The instantiated configuration object.
        """
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # Standard transformers safeguard: warn when loading a config saved by a
        # different model type, instead of silently mis-instantiating it.
        if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
            logger.warning(
                "You are using a model of type %s to instantiate a model of type %s. "
                'This is not supported for all configurations of models and can yield errors.',
                config_dict['model_type'], cls.model_type,
            )

        return cls.from_dict(config_dict, **kwargs)

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary.

        Returns:
            `dict`: All attributes of this configuration instance, plus the
            class-level `model_type` key.
        """
        output = copy.deepcopy(self.__dict__)
        output['model_type'] = self.__class__.model_type
        return output