steerling-8b / configuration_steerling.py
AyaGL's picture
Update configuration_steerling.py
d03f691 verified
from transformers import PretrainedConfig
class SteerlingConfig(PretrainedConfig):
    """Configuration container for models registered as ``"steerling"``.

    Every constructor argument except ``vocab_size`` is stored unchanged on
    the instance under an attribute of the same name; no validation or
    derivation is performed here. ``vocab_size``, the special-token ids and
    any remaining keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.

    Special-token ids are read from ``**kwargs``. When the caller does not
    supply them, these defaults are used:

    * ``pad_token_id``: 100277
    * ``bos_token_id``: 100278
    * ``eos_token_id``: 100257
    """

    model_type = "steerling"

    def __init__(
        self,
        vocab_size=100281,
        interpretable=True,
        n_layers=32,
        n_head=32,
        n_embd=4096,
        n_kv_heads=4,
        block_size=4096,
        diff_block_size=64,
        use_rms_norm=True,
        norm_eps=1e-05,
        norm_order="post",
        use_qk_norm=True,
        use_rope=True,
        rope_base=500000.0,
        rope_full_precision=True,
        clip_qkv=10.0,
        mlp_type="swiglu",
        activation="gelu",
        mlp_ratio=4,
        intermediate_size=None,
        use_bias=False,
        weight_sharing=True,
        mask_token_id=100280,
        endofchunk_token_id=100279,
        n_concepts=33732,
        n_unknown_concepts=101196,
        concept_dim=4096,
        use_attention_known=False,
        use_attention_unknown=False,
        topk_known=16,
        topk_known_features=32,
        unknown_topk=128,
        use_unknown=True,
        apply_topk_to_unknown=True,
        topk_on_logits=False,
        factorize_unknown=True,
        factorize_rank=256,
        use_epsilon_correction=True,
        concept_block_size=4096,
        pad_multiple=16,
        store_unknown_weights=False,
        inject_layer=16,
        inject_alpha=1.0,
        **kwargs,
    ):
        """Record the hyperparameters, then initialise the base config.

        All named arguments other than ``vocab_size`` become instance
        attributes with the same name and value. ``**kwargs`` is passed on
        to ``PretrainedConfig.__init__`` after the special-token defaults
        have been filled in.
        """
        # Attribute name -> value, listed in the order they are assigned.
        # The section labels are inferred from the argument names only.
        settings = {
            "interpretable": interpretable,
            # -- layer / attention sizes --
            "n_layers": n_layers,
            "n_head": n_head,
            "n_embd": n_embd,
            "n_kv_heads": n_kv_heads,
            "block_size": block_size,
            "diff_block_size": diff_block_size,
            # -- normalisation --
            "use_rms_norm": use_rms_norm,
            "norm_eps": norm_eps,
            "norm_order": norm_order,
            "use_qk_norm": use_qk_norm,
            # -- rotary embedding / qkv clipping --
            "use_rope": use_rope,
            "rope_base": rope_base,
            "rope_full_precision": rope_full_precision,
            "clip_qkv": clip_qkv,
            # -- MLP --
            "mlp_type": mlp_type,
            "activation": activation,
            "mlp_ratio": mlp_ratio,
            "intermediate_size": intermediate_size,
            "use_bias": use_bias,
            "weight_sharing": weight_sharing,
            # -- extra token ids kept on this class --
            "mask_token_id": mask_token_id,
            "endofchunk_token_id": endofchunk_token_id,
            # -- concept settings --
            "n_concepts": n_concepts,
            "n_unknown_concepts": n_unknown_concepts,
            "concept_dim": concept_dim,
            "use_attention_known": use_attention_known,
            "use_attention_unknown": use_attention_unknown,
            "topk_known": topk_known,
            "topk_known_features": topk_known_features,
            "unknown_topk": unknown_topk,
            "use_unknown": use_unknown,
            "apply_topk_to_unknown": apply_topk_to_unknown,
            "topk_on_logits": topk_on_logits,
            "factorize_unknown": factorize_unknown,
            "factorize_rank": factorize_rank,
            "use_epsilon_correction": use_epsilon_correction,
            "concept_block_size": concept_block_size,
            "pad_multiple": pad_multiple,
            "store_unknown_weights": store_unknown_weights,
            # -- injection --
            "inject_layer": inject_layer,
            "inject_alpha": inject_alpha,
        }
        for attr_name, attr_value in settings.items():
            setattr(self, attr_name, attr_value)

        # Fill in special-token ids only when the caller did not pass them;
        # an explicitly supplied value (including None) is left untouched.
        kwargs.setdefault("pad_token_id", 100277)
        kwargs.setdefault("bos_token_id", 100278)
        kwargs.setdefault("eos_token_id", 100257)

        super().__init__(vocab_size=vocab_size, **kwargs)