"""
Configuration class for the Unified Language Model.

HuggingFace Transformers compatible configuration with AutoClass support.
"""

from typing import Optional

from transformers import PretrainedConfig


class UnifiedModelConfig(PretrainedConfig):
    """
    Configuration class for UnifiedModel.

    Inherits from PretrainedConfig for full HuggingFace compatibility.
    """

    model_type = "unified_model"

    def __init__(
        self,
        vocab_size: Optional[int] = None,
        hidden_size: int = 256,
        intermediate_size: int = 1024,
        num_hidden_layers: int = 6,
        num_attention_heads: int = 8,
        num_key_value_heads: int = 4,
        max_position_embeddings: int = 2048,
        rms_norm_eps: float = 1e-6,
        rope_theta: float = 10000.0,
        # Dropout
        attention_dropout: float = 0.1,
        mlp_dropout: float = 0.1,
        embedding_dropout: float = 0.1,
        # xIELU activation
        xielu_alpha_p_init: float = 0.8,
        xielu_alpha_n_init: float = 0.8,
        xielu_beta: float = 0.5,
        # Weight tying
        tie_word_embeddings: bool = True,
        # LAX gating
        lax_enabled: bool = True,
        lax_gate_type: str = "linear",
        # Canon layers
        canon_enabled: bool = True,
        canon_kernel_size: int = 4,
        canon_a_enabled: bool = True,
        canon_c_enabled: bool = True,
        # FANformer
        fanformer_p: float = 0.15,
        # Special token ids
        pad_token_id: Optional[int] = None,
        bos_token_id: Optional[int] = None,
        eos_token_id: Optional[int] = None,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.max_position_embeddings = max_position_embeddings
        self.rms_norm_eps = rms_norm_eps
        self.rope_theta = rope_theta

        # Dropout
        self.attention_dropout = attention_dropout
        self.mlp_dropout = mlp_dropout
        self.embedding_dropout = embedding_dropout

        # xIELU activation
        self.xielu_alpha_p_init = xielu_alpha_p_init
        self.xielu_alpha_n_init = xielu_alpha_n_init
        self.xielu_beta = xielu_beta
        self.tie_word_embeddings = tie_word_embeddings

        # LAX gating
        self.lax_enabled = lax_enabled
        self.lax_gate_type = lax_gate_type

        # Canon layers
        self.canon_enabled = canon_enabled
        self.canon_kernel_size = canon_kernel_size
        self.canon_a_enabled = canon_a_enabled
        self.canon_c_enabled = canon_c_enabled

        # FANformer
        self.fanformer_p = fanformer_p

        # AutoClass mapping so the custom code can be resolved by the Auto* APIs
        self.auto_map = {
            "AutoConfig": "configuration_unified.UnifiedModelConfig",
            "AutoModel": "modeling_unified.UnifiedModel",
            "AutoModelForCausalLM": "modeling_unified.UnifiedModel",
        }

    def to_diff_dict(self):
        """
        Force serialization of tie_word_embeddings into config.json.

        Overrides to_diff_dict() to ensure that tie_word_embeddings always
        appears in config.json, avoiding load-time issues where HuggingFace
        does not recognize the weight tying.

        Returns:
            Dict: configuration dict with tie_word_embeddings always included.
        """
        output = super().to_diff_dict()

        # to_diff_dict() omits keys whose values match the PretrainedConfig
        # defaults, so re-insert tie_word_embeddings explicitly.
        output["tie_word_embeddings"] = self.tie_word_embeddings

        return output
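

# --- Usage sketch (illustrative, not part of the class) ---
# A minimal, hedged example of how this configuration is typically exercised:
# register it with AutoConfig, save it, and reload it, checking that
# tie_word_embeddings survives the round trip. The directory name and the
# explicit registration below are assumptions for demonstration only.
if __name__ == "__main__":
    from transformers import AutoConfig

    # Registering the model_type lets AutoConfig resolve this class locally.
    AutoConfig.register("unified_model", UnifiedModelConfig)

    config = UnifiedModelConfig(vocab_size=32000, pad_token_id=0)
    config.save_pretrained("./unified_model_config")  # writes config.json

    reloaded = AutoConfig.from_pretrained("./unified_model_config")
    assert isinstance(reloaded, UnifiedModelConfig)
    assert reloaded.tie_word_embeddings is True  # forced by to_diff_dict()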