from __future__ import annotations

from transformers import PretrainedConfig


class HanForgeConfig(PretrainedConfig):
    model_type = "hanforge"

    # <<< disabled (refactor 20260423, §4.1 hybrid local/global attention not used)
    # Preserved design assets: sliding_window / global_layer_interval / is_global_layer.
    # This refactor uses full causal attention only.
    # sliding_window: int = 256
    # global_layer_interval: int = 4
    # def is_global_layer(self, layer_idx: int) -> bool:
    #     return layer_idx % self.global_layer_interval == 0
    # >>> end disabled

    # <<< disabled (refactor 20260423, §4.2 YaRN not used)
    # rope_scaling / original_max_position_embeddings were fields premised on the YaRN extension.
    # For from-scratch training at 4k context, plain RoPE is sufficient.
    # original_max_position_embeddings: int = 4096
    # rope_scaling: dict | None = None
    # >>> end disabled

    def __init__(
        self,
        vocab_size: int = 32000,
        hidden_size: int = 384,
        intermediate_size: int = 1024,
        num_hidden_layers: int = 8,
        num_attention_heads: int = 6,
        num_key_value_heads: int = 2,
        max_position_embeddings: int = 4096,
        rope_theta: float = 50_000.0,
        rms_norm_eps: float = 1e-6,
        hidden_dropout_prob: float = 0.0,
        attention_dropout: float = 0.0,
        initializer_range: float = 0.02,
        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        unk_token_id: int = 3,
        use_cache: bool = False,
        **kwargs,
    ):
        # Back-compat: ignore the disabled fields even if older scripts/checkpoints pass them.
        kwargs.pop("sliding_window", None)
        kwargs.pop("global_layer_interval", None)
        kwargs.pop("original_max_position_embeddings", None)
        kwargs.pop("rope_scaling", None)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.rms_norm_eps = rms_norm_eps
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.use_cache = use_cache

        tie_word_embeddings = kwargs.pop("tie_word_embeddings", True)

        if hidden_size % num_attention_heads != 0:
            raise ValueError("hidden_size must be divisible by num_attention_heads")
        if num_attention_heads % num_key_value_heads != 0:
            raise ValueError("num_attention_heads must be divisible by num_key_value_heads")

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            unk_token_id=unk_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @property
    def head_dim(self) -> int:
        return self.hidden_size // self.num_attention_heads

    @property
    def num_key_value_groups(self) -> int:
        return self.num_attention_heads // self.num_key_value_heads
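
# Minimal usage sketch: constructs the config with the defaults above and reads the
# derived GQA quantities. The specific values in the comments follow directly from
# those defaults; the legacy-kwargs call only illustrates the back-compat pops in __init__.
if __name__ == "__main__":
    config = HanForgeConfig()
    # head_dim = hidden_size // num_attention_heads = 384 // 6 = 64
    # num_key_value_groups = num_attention_heads // num_key_value_heads = 6 // 2 = 3
    print(config.head_dim, config.num_key_value_groups)

    # Kwargs for the disabled hybrid-attention / YaRN fields are dropped, so they
    # never reach the serialized config.
    legacy = HanForgeConfig(sliding_window=256, rope_scaling=None)
    print("sliding_window" in legacy.to_dict())  # False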