"""Continue-1-OSS Model Configuration."""

from transformers import LlamaConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)


class Continue1Config(LlamaConfig):
    """Configuration class for the Continue-1-OSS model.

    A Llama-architecture configuration carrying Continue-1-OSS defaults:
    vocabulary/hidden sizes, grouped-query attention head counts
    (24 query heads, 8 key/value heads), a 128k max context, RoPE theta
    of 500000.0, and tied input/output embeddings. All arguments are
    forwarded to :class:`~transformers.LlamaConfig`; ``head_dim`` and
    ``mlp_bias`` are stored explicitly on the instance after the parent
    initializer runs.
    """

    # Identifier used by the Auto* machinery to map configs/models/tokenizers.
    model_type = "continue_oss"

    def __init__(
        self,
        vocab_size=156940,
        hidden_size=3072,
        intermediate_size=8192,
        num_hidden_layers=28,
        num_attention_heads=24,
        num_key_value_heads=8,
        head_dim=128,
        hidden_act="silu",
        max_position_embeddings=131072,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=128000,
        eos_token_id=128001,
        pretraining_tp=1,
        tie_word_embeddings=True,
        rope_theta=500000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        **kwargs,
    ):
        # Delegate everything the Llama config understands to the parent;
        # extra keys travel through **kwargs as usual.
        super().__init__(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            num_key_value_heads=num_key_value_heads,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
            initializer_range=initializer_range,
            rms_norm_eps=rms_norm_eps,
            use_cache=use_cache,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pretraining_tp=pretraining_tp,
            tie_word_embeddings=tie_word_embeddings,
            rope_theta=rope_theta,
            rope_scaling=rope_scaling,
            attention_bias=attention_bias,
            attention_dropout=attention_dropout,
            **kwargs,
        )
        # Set after super().__init__ so these take precedence over anything
        # the parent initializer may have derived or defaulted for them.
        self.head_dim = head_dim
        self.mlp_bias = mlp_bias


# Register so AutoConfig can resolve model_type "continue_oss" to this class.
Continue1Config.register_for_auto_class()