from __future__ import annotations

from transformers import PretrainedConfig


class HanForgeConfig(PretrainedConfig):
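    """Configuration for the HanForge decoder-only model.

    Defines a small causal transformer with grouped-query attention (GQA),
    RoPE position encoding, and a 4k-token context window, trained from
    scratch. The disabled blocks below preserve design assets from earlier
    iterations that this refactor no longer uses.
    """
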
    model_type = "hanforge"

    # <<< disabled (refactor 20260423, §4.1 hybrid local/global attention unused)
    # Preserved design assets: sliding_window / global_layer_interval / is_global_layer.
    # This refactor uses full causal attention only.
    # sliding_window: int = 256
    # global_layer_interval: int = 4
    # def is_global_layer(self, layer_idx: int) -> bool:
    #     return layer_idx % self.global_layer_interval == 0
    # >>> end disabled

    # <<< disabled (refactor 20260423, §4.2 YaRN unused)
    # rope_scaling / original_max_position_embeddings were fields predicated on the YaRN extension.
    # Plain RoPE is sufficient for from-scratch training at 4k context.
    # original_max_position_embeddings: int = 4096
    # rope_scaling: dict | None = None
    # >>> end disabled

    def __init__(
        self,
        vocab_size: int = 32000,
        hidden_size: int = 384,
        intermediate_size: int = 1024,
        num_hidden_layers: int = 8,
        num_attention_heads: int = 6,
        num_key_value_heads: int = 2,
        max_position_embeddings: int = 4096,
        rope_theta: float = 50_000.0,
        rms_norm_eps: float = 1e-6,
        hidden_dropout_prob: float = 0.0,
        attention_dropout: float = 0.0,
        initializer_range: float = 0.02,
        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        unk_token_id: int = 3,
        use_cache: bool = False,
        **kwargs,
    ):
        # Back-compat: silently ignore the disabled fields even if legacy
        # scripts/checkpoints still pass them.
        kwargs.pop("sliding_window", None)
        kwargs.pop("global_layer_interval", None)
        kwargs.pop("original_max_position_embeddings", None)
        kwargs.pop("rope_scaling", None)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.rms_norm_eps = rms_norm_eps
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        tie_word_embeddings = kwargs.pop("tie_word_embeddings", True)

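        # Shape invariants: hidden_size must split evenly across attention
        # heads, and query heads must group evenly onto KV heads (GQA).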
        if hidden_size % num_attention_heads != 0:
            raise ValueError("hidden_size must be divisible by num_attention_heads")
        if num_attention_heads % num_key_value_heads != 0:
            raise ValueError("num_attention_heads must be divisible by num_key_value_heads")

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            unk_token_id=unk_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @property
    def head_dim(self) -> int:
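        """Per-head attention width, hidden_size // num_attention_heads (384 // 6 = 64 with the defaults)."""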
        return self.hidden_size // self.num_attention_heads

    @property
    def num_key_value_groups(self) -> int:
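        """Number of query heads that share each KV head under GQA (6 // 2 = 3 with the defaults)."""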
        return self.num_attention_heads // self.num_key_value_heads
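

if __name__ == "__main__":
    # Minimal smoke-test sketch, assuming only that `transformers` is
    # installed; not part of the training pipeline. It exercises the defaults,
    # the derived GQA shapes, and the back-compat kwarg dropping in __init__.
    cfg = HanForgeConfig()
    print(cfg.head_dim)              # 384 // 6 = 64
    print(cfg.num_key_value_groups)  # 6 // 2 = 3 query heads per KV head

    # Disabled legacy fields are popped before super().__init__, so they never
    # land on the config object even when old scripts still pass them.
    legacy = HanForgeConfig(sliding_window=256, rope_scaling={"type": "yarn"})
    assert not hasattr(legacy, "sliding_window")
    assert not hasattr(legacy, "rope_scaling")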