HanForge-base / configuration_hanforge.py
from __future__ import annotations
from transformers import PretrainedConfig
class HanForgeConfig(PretrainedConfig):
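    """HanForge model configuration.

    The defaults below describe a small causal-attention Transformer: 32k vocab,
    hidden size 384, 8 layers, 6 attention heads with 2 key/value heads (GQA),
    4k max positions, RoPE (theta 50,000), RMSNorm, and tied word embeddings.
    """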
    model_type = "hanforge"
    # <<< disabled (refactor 20260423, §4.1: hybrid local/global attention not used)
    # Preserved design assets: sliding_window / global_layer_interval / is_global_layer.
    # This refactor uses full causal attention only.
    # sliding_window: int = 256
    # global_layer_interval: int = 4
    # def is_global_layer(self, layer_idx: int) -> bool:
    #     return layer_idx % self.global_layer_interval == 0
    # >>> end disabled
    # <<< disabled (refactor 20260423, §4.2: YaRN not used)
    # rope_scaling / original_max_position_embeddings presupposed the YaRN extension.
    # Plain RoPE is sufficient for from-scratch training at a 4k context.
    # original_max_position_embeddings: int = 4096
    # rope_scaling: dict | None = None
    # >>> end disabled
    def __init__(
        self,
        vocab_size: int = 32000,
        hidden_size: int = 384,
        intermediate_size: int = 1024,
        num_hidden_layers: int = 8,
        num_attention_heads: int = 6,
        num_key_value_heads: int = 2,
        max_position_embeddings: int = 4096,
        rope_theta: float = 50_000.0,
        rms_norm_eps: float = 1e-6,
        hidden_dropout_prob: float = 0.0,
        attention_dropout: float = 0.0,
        initializer_range: float = 0.02,
        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        unk_token_id: int = 3,
        use_cache: bool = False,
        **kwargs,
    ):
        # Back-compat: silently drop the disabled fields even if legacy scripts
        # or checkpoints still pass them.
        kwargs.pop("sliding_window", None)
        kwargs.pop("global_layer_interval", None)
        kwargs.pop("original_max_position_embeddings", None)
        kwargs.pop("rope_scaling", None)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.rms_norm_eps = rms_norm_eps
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        tie_word_embeddings = kwargs.pop("tie_word_embeddings", True)
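        # Sanity checks: head_dim must come out as an integer, and GQA requires
        # the query heads to split evenly across the key/value heads.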
        if hidden_size % num_attention_heads != 0:
            raise ValueError("hidden_size must be divisible by num_attention_heads")
        if num_attention_heads % num_key_value_heads != 0:
            raise ValueError("num_attention_heads must be divisible by num_key_value_heads")
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            unk_token_id=unk_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
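    # Per-head dimension; with the defaults this is 384 // 6 = 64.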
    @property
    def head_dim(self) -> int:
        return self.hidden_size // self.num_attention_heads
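    # Number of query heads sharing each key/value head (GQA); 6 // 2 = 3 by default.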
    @property
    def num_key_value_groups(self) -> int:
        return self.num_attention_heads // self.num_key_value_heads
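
# Minimal usage sketch (illustrative only): build the default config, read the
# derived values, and confirm that legacy kwargs for the disabled features are
# silently dropped by __init__.
if __name__ == "__main__":
    cfg = HanForgeConfig()
    print(cfg.model_type)            # hanforge
    print(cfg.head_dim)              # 64 (384 // 6)
    print(cfg.num_key_value_groups)  # 3  (6 query heads // 2 KV heads)

    legacy = HanForgeConfig(sliding_window=512, rope_scaling={"type": "yarn"})
    print(getattr(legacy, "sliding_window", None))  # None -- popped in __init__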