"""A HuggingFace-style model configuration for PhoGPT."""
import warnings
from typing import Any, Dict, Optional, Union

from transformers import PretrainedConfig

from .attention import check_alibi_support, is_flash_v1_installed, is_flash_v2_installed
from .blocks import attn_config_defaults as phogpt_attn_defaults
from .fc import FC_CLASS_REGISTRY
from .norm import LPLayerNorm
from .ffn import FFN_CLASS_REGISTRY
from .warnings import VersionedDeprecationWarning
|
|
ffn_config_defaults: Dict = {'ffn_type': 'phogpt_mlp'}

init_config_defaults: Dict = {
    'name': 'kaiming_normal_',
    'fan_mode': 'fan_in',
    'init_nonlinearity': 'relu',
    'init_div_is_residual': True,
    'emb_init_std': None,
    'emb_init_uniform_lim': None,
    'init_std': None,
    'init_gain': 0.0,
}
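
# Example (illustrative value for 'fan_mode'): callers only need to pass the keys
# they want to override; _validate_config fills in the remaining keys from these
# module-level defaults, e.g.
#
#     cfg = PhoGPTConfig(init_config={'fan_mode': 'fan_out'})
#     assert cfg.init_config['name'] == 'kaiming_normal_'  # filled from the defaults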
|
|
class PhoGPTConfig(PretrainedConfig):
    model_type = 'phogpt'

    def __init__(
        self,
        hidden_size: int = 4096,
        num_attention_heads: int = 32,
        num_hidden_layers: int = 32,
        expansion_ratio: Union[int, float] = 4,
        max_seq_len: int = 4096,
        vocab_size: int = 51200,
        resid_pdrop: float = 0.0,
        emb_pdrop: float = 0.0,
        learned_pos_emb: bool = True,
        attn_config: Dict = phogpt_attn_defaults,
        ffn_config: Dict = ffn_config_defaults,
        init_device: str = 'cpu',
        logit_scale: Optional[Union[float, str]] = None,
        no_bias: bool = False,
        embedding_fraction: float = 1.0,
        norm_type: str = 'low_precision_layernorm',
        use_cache: bool = False,
        init_config: Dict = init_config_defaults,
        fc_type: str = 'torch',
        tie_word_embeddings: bool = True,
        use_pad_tok_in_ffn: bool = True,
        **kwargs: Any
    ):
        """PhoGPT configuration class.

        Args:
            hidden_size (int): Model hidden size (embedding dimension).
            num_attention_heads (int): Number of attention heads per layer.
            num_hidden_layers (int): Number of transformer layers.
            expansion_ratio (int | float): Ratio of the FFN hidden size to hidden_size.
            max_seq_len (int): Maximum sequence length.
            vocab_size (int): Vocabulary size.
            resid_pdrop (float): Dropout probability applied to residual connections.
            emb_pdrop (float): Dropout probability applied to the embedding output.
            learned_pos_emb (bool): Whether to use learned positional embeddings.
            attn_config (dict): Attention configuration dictionary.
            ffn_config (dict): Feed-forward network configuration dictionary.
            init_device (str): Device used for parameter initialization.
            logit_scale (float | str): Scaling applied to the output logits.
            no_bias (bool): Whether to disable bias terms.
            embedding_fraction (float): Fraction by which to scale embedding gradients; must lie in (0, 1].
            norm_type (str): Type of layer normalization to use.
            use_cache (bool): Whether to return past key/value states.
            init_config (dict): Weight initialization configuration dictionary.
            fc_type (str): Fully connected layer type ('torch' or 'te').
            tie_word_embeddings (bool): Whether to tie the input and output embedding weights.
            use_pad_tok_in_ffn (bool): Whether to forward padding tokens through the FFN.
        """
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.expansion_ratio = expansion_ratio
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.learned_pos_emb = learned_pos_emb
        self.attn_config = attn_config
        self.ffn_config = ffn_config
        self.init_device = init_device
        self.logit_scale = logit_scale
        self.no_bias = no_bias
        self.embedding_fraction = embedding_fraction
        self.norm_type = norm_type
        self.use_cache = use_cache
        self.init_config = init_config
        self.fc_type = fc_type
        self.use_pad_tok_in_ffn = use_pad_tok_in_ffn
|
|
        # Discard keys that this configuration does not use before calling the
        # parent constructor.
        if 'name' in kwargs:
            del kwargs['name']
        if 'loss_fn' in kwargs:
            del kwargs['loss_fn']

        # ALiBi and RoPE already provide positional information, so learned
        # positional embeddings are disabled when either is requested.
        if self.attn_config.get('alibi', False) or self.attn_config.get('rope', False):
            self.learned_pos_emb = False
            warnings.warn("alibi or rope is enabled in attn_config; setting learned_pos_emb to False.")

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
        self._validate_config()
|
|
    def _set_config_defaults(self, config: Dict[str, Any], config_defaults: Dict[str, Any]) -> Dict[str, Any]:
        for k, v in config_defaults.items():
            if k not in config:
                config[k] = v
            elif isinstance(v, dict):
                config[k] = self._set_config_defaults(config.get(k, {}), v)
        return config
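        # Example of the merge performed above (illustrative keys):
        #     self._set_config_defaults({'alibi': True}, {'alibi': False, 'attn_pdrop': 0.0})
        #     -> {'alibi': True, 'attn_pdrop': 0.0}
        # Keys already present in config win; missing keys, including keys inside
        # nested dicts, are filled in recursively from config_defaults.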
|
|
    def _validate_config(self) -> None:
        # Fill in any missing keys from the module-level defaults.
        self.attn_config = self._set_config_defaults(self.attn_config, phogpt_attn_defaults)
        self.ffn_config = self._set_config_defaults(self.ffn_config, ffn_config_defaults)
        self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)

        if self.hidden_size % self.num_attention_heads != 0:
            raise ValueError("hidden_size must be divisible by num_attention_heads")

        for prob in [self.attn_config.get('attn_pdrop', 0.0), self.resid_pdrop, self.emb_pdrop]:
            if not 0.0 <= prob <= 1.0:
                raise ValueError("Dropout probabilities (attn_pdrop, resid_pdrop, emb_pdrop) must be between 0 and 1")

        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
            raise ValueError("embedding_fraction must be in (0, 1]")

        if not (self.learned_pos_emb or self.attn_config.get('alibi', False) or self.attn_config.get('rope', False)):
            warnings.warn("No positional encoding is in use: one of learned_pos_emb, alibi, or rope should be enabled.")

        if self.fc_type == 'te' or self.ffn_config.get('ffn_type') == 'te_ln_mlp':
            try:
                import transformer_engine.pytorch as te
                del te
            except ImportError as exc:
                raise ImportError(
                    "fc_type='te' and ffn_type='te_ln_mlp' require the TransformerEngine package to be installed."
                ) from exc
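
# Minimal usage sketch (illustrative; the parameter values below are arbitrary
# examples, and attn_config={'alibi': True} assumes ALiBi is supported by the
# installed attention implementation):
#
#     config = PhoGPTConfig(
#         hidden_size=2048,
#         num_attention_heads=16,
#         num_hidden_layers=24,
#         max_seq_len=2048,
#         vocab_size=20480,
#         attn_config={'alibi': True},
#     )
#     print(config.to_json_string())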
|
|
|
|