happyQuasarv21 / configuration_quasar.py
Bittoby1040's picture
Thank you first commit
b9cb1ba verified
"""Quasar model configuration — HuggingFace compatible.
"""
from transformers.configuration_utils import PreTrainedConfig
class QuasarConfig(PreTrainedConfig):
    """Configuration for the ``quasar`` model type.

    Every constructor argument is stored on the instance under its own name.
    Some are additionally mirrored under alias names so that code written
    against other naming conventions can read them:

    * ``d_model``      -> ``hidden_size``
    * ``n_layers``     -> ``num_hidden_layers``
    * ``n_heads``      -> ``num_attention_heads`` and ``num_heads``
    * ``d_ff``         -> ``intermediate_size``
    * ``max_seq_len``  -> ``max_position_embeddings``
    * ``rms_norm_eps`` -> ``norm_eps``

    Derived attributes set by ``__init__``:

    * ``hybrid_layer_types`` -- one ``"quasar"`` / ``"gla"`` label per layer,
      built from ``quasar_layers``, ``gated_layers`` and ``use_gla_first``
      (see :meth:`_build_hybrid_layer_types`).
    * ``layer_types`` -- always ``["linear_attention"] * n_layers``. A
      ``layer_types`` entry supplied through ``**kwargs`` is discarded.
    * ``num_key_value_heads`` -- taken from ``**kwargs`` when present,
      otherwise ``n_heads``.
    * ``num_v_heads`` -- taken from ``**kwargs`` when present, otherwise
      ``None``.

    ``pad_token_id``, ``bos_token_id``, ``eos_token_id``,
    ``tie_word_embeddings`` and all remaining ``**kwargs`` are forwarded to
    the base-class constructor.

    Raises:
        ValueError: if ``n_layers > 0`` while
            ``quasar_layers + gated_layers == 0`` (the layer pattern would
            have zero length).
    """

    model_type = "quasar"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        # Core dimensions
        vocab_size: int = 248320,
        d_model: int = 1536,
        n_layers: int = 24,
        n_heads: int = 12,
        d_ff: int = 4096,
        head_dim: int = 128,
        max_seq_len: int = 16384,
        dropout: float = 0.0,
        rms_norm_eps: float = 1e-6,
        initializer_range: float = 0.02,
        use_cache: bool = True,
        tie_word_embeddings: bool = False,
        # HF aliases (set automatically)
        # hidden_size = d_model, num_hidden_layers = n_layers, etc.
        # Hybrid layer config
        quasar_layers: int = 4,
        gated_layers: int = 2,
        use_gla_first: bool = False,
        # QuasarAttention params
        use_short_conv: bool = True,
        conv_size: int = 4,
        conv_bias: bool = False,
        allow_neg_eigval: bool = False,
        attn_mode: str = "chunk",
        # GLA params
        expand_k: float = 0.5,
        expand_v: float = 1.0,
        gla_mode: str = "chunk",
        # Latent Memory params
        memory_slots: int = 128,
        memory_dim: int = 128,
        # MoE params
        moe_type: str = "bigmac",
        num_shared_experts: int = 1,
        num_routed_experts: int = 64,
        top_k: int = 4,
        shared_expert_size: int = 3072,
        routed_expert_size: int = 256,
        dense_input_layers: int = 4,
        bigmac_r: float = 0.25,
        # MoE stability (SMEBU)
        moe_z_loss_coeff: float = 1e-4,
        moe_aux_loss_coeff: float = 1e-4,
        smebu_kappa: float = 2.0,
        smebu_lambda: float = 2e-3,
        smebu_beta: float = 0.5,
        # Looped transformer
        num_loops: int = 1,
        use_looped_injection: bool = False,
        looped_injection_init: float = 0.1,
        # RoPE
        rope_theta: float = 1000000.0,
        # Training
        gradient_checkpointing: bool = False,
        residual_scale: float = 0.1,
        # FLA compatibility
        fuse_norm: bool = True,
        fuse_swiglu: bool = True,
        fuse_cross_entropy: bool = True,
        use_l2warp: bool = False,
        hidden_act: str = "silu",
        hidden_ratio: int | None = 4,
        # Token ids
        pad_token_id: int | None = None,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        **kwargs,
    ):
        # Core dimensions, each stored under its native name and its aliases.
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.hidden_size = d_model
        self.n_layers = n_layers
        self.num_hidden_layers = n_layers
        self.n_heads = n_heads
        self.num_attention_heads = n_heads
        self.num_heads = n_heads  # FLA alias
        self.d_ff = d_ff
        self.intermediate_size = d_ff
        self.head_dim = head_dim
        self.max_seq_len = max_seq_len
        self.max_position_embeddings = max_seq_len
        self.dropout = dropout
        self.rms_norm_eps = rms_norm_eps
        self.norm_eps = rms_norm_eps  # FLA alias
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.tie_word_embeddings = tie_word_embeddings

        # Hybrid layer config
        self.quasar_layers = quasar_layers
        self.gated_layers = gated_layers
        self.use_gla_first = use_gla_first
        # layer_types uses HF-allowed values only (for validation);
        # hybrid_layer_types stores the actual quasar/gla distinction.
        # layer_types is always forced to the HF-safe value, even if
        # quasar/gla labels were passed in.
        # NOTE: the helper reads n_layers / quasar_layers / gated_layers /
        # use_gla_first from self, so it must run after the assignments above.
        self.hybrid_layer_types = self._build_hybrid_layer_types()
        self.layer_types = ["linear_attention"] * self.n_layers

        # QuasarAttention params
        self.use_short_conv = use_short_conv
        self.conv_size = conv_size
        self.conv_bias = conv_bias
        self.allow_neg_eigval = allow_neg_eigval
        self.attn_mode = attn_mode

        # GLA params
        self.expand_k = expand_k
        self.expand_v = expand_v
        self.gla_mode = gla_mode

        # Latent Memory
        self.memory_slots = memory_slots
        self.memory_dim = memory_dim

        # MoE
        self.moe_type = moe_type
        self.num_shared_experts = num_shared_experts
        self.num_routed_experts = num_routed_experts
        self.top_k = top_k
        self.shared_expert_size = shared_expert_size
        self.routed_expert_size = routed_expert_size
        self.dense_input_layers = dense_input_layers
        self.bigmac_r = bigmac_r

        # SMEBU
        self.moe_z_loss_coeff = moe_z_loss_coeff
        self.moe_aux_loss_coeff = moe_aux_loss_coeff
        self.smebu_kappa = smebu_kappa
        self.smebu_lambda = smebu_lambda
        self.smebu_beta = smebu_beta

        # Looped transformer
        self.num_loops = num_loops
        self.use_looped_injection = use_looped_injection
        self.looped_injection_init = looped_injection_init

        # RoPE
        self.rope_theta = rope_theta

        # Training
        self.gradient_checkpointing = gradient_checkpointing
        self.residual_scale = residual_scale

        # FLA compatibility
        self.fuse_norm = fuse_norm
        self.fuse_swiglu = fuse_swiglu
        self.fuse_cross_entropy = fuse_cross_entropy
        self.use_l2warp = use_l2warp
        self.hidden_act = hidden_act
        self.hidden_ratio = hidden_ratio

        # KV heads (for HF compatibility). These are read with .get(), not
        # popped, so the keys are still forwarded to the base class below.
        self.num_key_value_heads = kwargs.get("num_key_value_heads", n_heads)
        self.num_v_heads = kwargs.get("num_v_heads", None)

        # Pop layer_types from kwargs to prevent PreTrainedConfig from
        # overriding our HF-safe value with quasar/gla from config.json.
        kwargs.pop("layer_types", None)

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _build_hybrid_layer_types(self) -> list[str]:
        """Internal quasar/gla distinction — not validated by HF.

        Layers follow a repeating pattern of length
        ``quasar_layers + gated_layers``. Within each repetition:

        * ``use_gla_first`` false: the first ``quasar_layers`` positions are
          ``"quasar"``, the rest ``"gla"``.
        * ``use_gla_first`` true: the first ``gated_layers`` positions are
          ``"gla"``, the rest ``"quasar"``.

        Returns:
            A list of ``n_layers`` strings, each ``"quasar"`` or ``"gla"``.

        Raises:
            ValueError: if ``n_layers > 0`` and the pattern length is zero.
        """
        cycle_len = self.quasar_layers + self.gated_layers
        # A zero-length pattern would make the modulo below raise an opaque
        # ZeroDivisionError; fail with a message naming the parameters instead.
        if cycle_len == 0 and self.n_layers > 0:
            raise ValueError(
                "quasar_layers + gated_layers must be non-zero when "
                f"n_layers > 0 (got quasar_layers={self.quasar_layers}, "
                f"gated_layers={self.gated_layers}, n_layers={self.n_layers})"
            )

        types = []
        for layer_idx in range(self.n_layers):
            pos_in_cycle = layer_idx % cycle_len
            if self.use_gla_first:
                is_quasar = pos_in_cycle >= self.gated_layers
            else:
                is_quasar = pos_in_cycle < self.quasar_layers
            types.append("quasar" if is_quasar else "gla")
        return types
# Names exported by `from <this module> import *`: only the config class.
__all__ = ["QuasarConfig"]