"""Configuration for the Llama 3 baseline transformer. All architectural parameters that vary across model scales or are meaningful research variables are expressed here. Architectural constants of Llama 3 (no bias in linear layers, SwiGLU activation with SiLU gate) are implemented in the relevant modules and documented at the point of use — they are not config parameters because they do not vary across Llama 3 scales and changing them produces a different architecture, not a different scale of this one. RoPE configuration is handled by HuggingFace's RotaryEmbeddingConfigMixin (mixed into PretrainedConfig in transformers 5.x). rope_theta and rope_scaling are passed through to the base class, which validates and standardises them into config.rope_parameters. Do not bypass or duplicate this system. """ from transformers import PretrainedConfig class Llama3Config(PretrainedConfig): """Configuration class for the Llama 3 baseline decoder-only transformer. This config is the single source of truth for every architectural dimension of the model. Nothing in the architecture may use a literal number that belongs here — doing so breaks the library's ability to express different model scales without code changes. RoPE scaling is handled by HuggingFace's rope system. Pass rope_scaling as a dict using HF's format (key is ``rope_type``, not ``type``). Supported types: ``"linear"``, ``"dynamic"``, ``"yarn"``, ``"longrope"``, ``"llama3"``. HF validates the dict and standardises it into ``config.rope_parameters``. Registered with HuggingFace AutoClass via ``auto_map``. Instantiate from the Hub:: config = AutoConfig.from_pretrained( "your-namespace/advanced-transformers-lib", trust_remote_code=True, num_hidden_layers=16, # override any parameter at instantiation time ) model = AutoModelForCausalLM.from_config(config) Args: vocab_size: Vocabulary size. Controls the embedding table and output logits dimension. Must match the tokenizer. hidden_size: Model width. The central dimension from which all others are derived or to which they project. intermediate_size: FFN hidden dimension. Expressed directly rather than derived from a formula because Llama 3 ratios vary by scale (~3.5x at 8B/70B, ~3.25x at 405B). A formula would be wrong for at least some scales. num_hidden_layers: Number of transformer blocks stacked in sequence. num_attention_heads: Number of query heads. Determines how hidden_size is partitioned per head. num_key_value_heads: Number of KV heads for Grouped Query Attention. Must evenly divide num_attention_heads. Setting equal to num_attention_heads gives standard MHA; setting to 1 gives MQA; values between give GQA. Llama 3 uses 8 at all scales, motivated by KV cache memory at 128K context. head_dim: Dimension per attention head. Normally hidden_size // num_attention_heads, but exposed as a parameter for architectures that decouple head count from head size. Computed automatically if None. rms_norm_eps: Epsilon passed to torch.nn.RMSNorm. Prevents division by zero when layer activations are near zero. rope_theta: Base rotation frequency for RoPE. Controls how fast position angles rotate per dimension — higher values mean slower rotation, preventing positional aliasing at long sequence distances. Llama 3 uses 500,000 (vs ~10,000 typical) as a prerequisite for 128K context support. This value has physical meaning tied to the target context length and must never be hardcoded in the architecture. max_position_embeddings: The context length the model was trained at. Used by HF's rope system as original_max_position_embeddings for scaling types that need it (yarn, longrope, llama3). This is the training context length, not an inference ceiling — the rope module handles longer sequences at runtime via lazy cache extension. Llama 3 base training context: 8192. rope_scaling: Optional RoPE scaling configuration for extending context beyond max_position_embeddings without retraining. Pass as a dict in HF's format with ``rope_type`` as the key. HF's RotaryEmbeddingConfigMixin validates and stores this. None means no scaling (default RoPE behaviour). attention_dropout: Dropout probability applied to attention weights. Default 0.0 for deterministic behaviour. use_cache: Whether the model returns past_key_values for KV caching. Set True for inference, may be set False during training to reduce memory pressure. output_hidden_states: Whether the model returns the hidden state tensor after each decoder layer. Useful for probing or intermediate representation extraction. Default False. tie_word_embeddings: Whether the input embedding table and the LM head share weights. False for Llama 3. """ model_type = "llama3_baseline" # auto_map tells HuggingFace which classes to instantiate when loading this config # with trust_remote_code=True. Paths are relative to the Hub repository root, not # the local src/ layout — these are the paths used after HF downloads the files. auto_map = { "AutoConfig": "configuration.Llama3Config", "AutoModelForCausalLM": "huggingface.Llama3ForCausalLM", } def __init__( self, vocab_size: int = 50277, hidden_size: int = 768, intermediate_size: int = 1568, num_hidden_layers: int = 24, num_attention_heads: int = 16, num_key_value_heads: int = 4, head_dim: int | None = None, rms_norm_eps: float = 1e-5, rope_theta: float = 500000.0, max_position_embeddings: int = 8192, rope_scaling: dict | None = None, attention_dropout: float = 0.0, use_cache: bool = True, output_hidden_states: bool = False, tie_word_embeddings: bool = False, **kwargs, ): # Validate structural constraints before storing anything, so that an invalid # config fails loudly at construction rather than silently producing wrong # shapes at forward-pass time. if hidden_size % num_attention_heads != 0: raise ValueError( f"hidden_size ({hidden_size}) must be divisible by " f"num_attention_heads ({num_attention_heads})." ) if num_attention_heads % num_key_value_heads != 0: raise ValueError( f"num_attention_heads ({num_attention_heads}) must be divisible by " f"num_key_value_heads ({num_key_value_heads}). GQA requires query " f"heads to divide evenly across KV head groups." ) # RoPE rotates dimensions in pairs. An odd head_dim has no valid pairing and # produces a cos/sin cache of size head_dim+1 (torch.arange(0, odd, 2) rounds # up), causing a shape mismatch at runtime. Catch it here rather than at # forward-pass time. resolved_head_dim = head_dim if head_dim is not None else hidden_size // num_attention_heads if resolved_head_dim % 2 != 0: raise ValueError( f"head_dim must be even (RoPE rotates dimensions in pairs). " f"Got head_dim={resolved_head_dim} from hidden_size={hidden_size} " f"and num_attention_heads={num_attention_heads}." ) # head_dim is normally hidden_size // num_attention_heads but is exposed as a # parameter for architectures that decouple head count from head size. if head_dim is None: head_dim = hidden_size // num_attention_heads self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.num_key_value_heads = num_key_value_heads self.head_dim = head_dim self.rms_norm_eps = rms_norm_eps self.attention_dropout = attention_dropout self.use_cache = use_cache # rope_theta, max_position_embeddings, and rope_scaling are passed to HF's # base class, which owns rope configuration via RotaryEmbeddingConfigMixin. # HF validates rope_scaling and standardises everything into rope_parameters. # Do not store or validate these ourselves. super().__init__( rope_theta=rope_theta, max_position_embeddings=max_position_embeddings, rope_scaling=rope_scaling, tie_word_embeddings=tie_word_embeddings, output_hidden_states=output_hidden_states, **kwargs, ) # Promote auto_map to an instance attribute so PretrainedConfig.to_dict() # serialises it into config.json. Class-level attributes are not picked up # by to_dict() — only self.__dict__ is serialised. model_type is the sole # exception handled specially by HF; auto_map is not. self.auto_map = type(self).auto_map