"""
GSLM Model Configuration
"""

import json
import os
from typing import Optional
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

# Module-level logger obtained from the transformers logging utilities.
logger = logging.get_logger(__name__)


class GSLMConfig(PretrainedConfig):
    """
    Configuration class for GSLM (Generative Spoken Language Model).

    Stores the hyperparameters passed to ``__init__`` as instance attributes
    and forwards any remaining keyword arguments to ``PretrainedConfig``.

    The standardized Hugging Face attribute names ``hidden_size``,
    ``num_attention_heads`` and ``num_hidden_layers`` are exposed as aliases
    of ``d_model``, ``nhead`` and ``num_layers`` through ``attribute_map``.
    """

    model_type = "gslm"

    # Aliases from the standardized Hugging Face attribute names to the names
    # used by this configuration, so that generic code reading e.g.
    # ``config.hidden_size`` works with this config. The native names remain
    # the ones stored on the instance and written to the serialized config.
    attribute_map = {
        "hidden_size": "d_model",
        "num_attention_heads": "nhead",
        "num_hidden_layers": "num_layers",
    }

    def __init__(
        self,
        vocab_size: int = 204,
        d_model: int = 1024,
        nhead: int = 16,
        num_layers: int = 12,
        dim_feedforward: int = 4096,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        max_seq_length: int = 3072,
        pad_idx: int = 204,
        share_input_output_embed: bool = True,
        activation: str = "relu",
        architecture: str = "transformer_lm_big",
        **kwargs
    ) -> None:
        """
        Initialize GSLM configuration.

        Args:
            vocab_size: Size of the vocabulary
            d_model: Dimensionality of the embeddings and hidden states
            nhead: Number of attention heads
            num_layers: Number of transformer layers
            dim_feedforward: Dimensionality of the feedforward network
            dropout: Dropout probability
            attention_dropout: Dropout probability for attention weights
            max_seq_length: Maximum sequence length
            pad_idx: Padding token index
            share_input_output_embed: Whether to share input and output embeddings
            activation: Activation function ("relu" or "gelu")
            architecture: Model architecture name
            **kwargs: Additional keyword arguments forwarded unchanged to
                ``PretrainedConfig.__init__``
        """
        # NOTE(review): the default pad_idx (204) equals the default
        # vocab_size (204), i.e. it is one past the last index of a
        # vocab_size-entry table. Presumably the model reserves an extra slot
        # for padding -- confirm against the model implementation.
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.nhead = nhead
        self.num_layers = num_layers
        self.dim_feedforward = dim_feedforward
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.max_seq_length = max_seq_length
        self.pad_idx = pad_idx
        self.share_input_output_embed = share_input_output_embed
        self.activation = activation
        self.architecture = architecture

        # Model-specific fields are set first; the base class then consumes
        # whatever keyword arguments remain.
        super().__init__(**kwargs)