from typing import Literal, Optional

from transformers import PretrainedConfig

from lit_gpt.config import Config


class DiffusionLlamaConfig(Config, PretrainedConfig):
    """Configuration for the diffusion LLaMA model, combining lit_gpt's Config
    with Hugging Face's PretrainedConfig."""

    model_type = "diff_llama_v2"
    # Note: no trailing commas here, otherwise these become one-element tuples.
    eos_token_id = 2
    pad_token_id = 0
    mask_token_id = 32000

    def __init__(
        self,
        block_size: int = 4096,
        vocab_size: int = 50254,
        padding_multiple: int = 512,
        padded_vocab_size: Optional[int] = None,
        n_layer: int = 16,
        n_head: int = 32,
        n_embd: int = 4096,
        rotary_percentage: float = 0.25,
        parallel_residual: bool = True,
        bias: bool = True,
        n_query_groups: Optional[int] = None,
        shared_attention_norm: bool = False,
        _norm_class: Literal["LayerNorm", "RMSNorm", "FusedRMSNorm"] = "LayerNorm",
        norm_eps: float = 1e-5,
        _mlp_class: Literal["GptNeoxMLP", "LLaMAMLP"] = "GptNeoxMLP",
        intermediate_size: Optional[int] = None,
        condense_ratio: int = 1,
        **kwargs,
    ):
        # Initialize the lit_gpt side of the config with the model hyperparameters.
        Config.__init__(
            self,
            block_size=block_size,
            vocab_size=vocab_size,
            padding_multiple=padding_multiple,
            padded_vocab_size=padded_vocab_size,
            n_layer=n_layer,
            n_head=n_head,
            n_embd=n_embd,
            rotary_percentage=rotary_percentage,
            parallel_residual=parallel_residual,
            bias=bias,
            n_query_groups=n_query_groups,
            shared_attention_norm=shared_attention_norm,
            _norm_class=_norm_class,
            norm_eps=norm_eps,
            _mlp_class=_mlp_class,
            intermediate_size=intermediate_size,
            condense_ratio=condense_ratio,
        )
        # Forward any remaining keyword arguments to the Hugging Face config
        # (e.g. tie_word_embeddings, torch_dtype, generation defaults).
        PretrainedConfig.__init__(self, **kwargs)
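

# Minimal usage sketch (not part of the original module): the hyperparameter
# values below are hypothetical, and this assumes lit_gpt's Config accepts the
# keyword arguments declared in __init__ above.
if __name__ == "__main__":
    config = DiffusionLlamaConfig(n_layer=22, n_head=32, n_embd=2048)
    # Class-level token ids remain available alongside the lit_gpt fields.
    print(config.model_type, config.mask_token_id, config.n_layer)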