Update configuration_Llamoe.py
Browse files- configuration_Llamoe.py +9 -10
configuration_Llamoe.py
CHANGED
|
@@ -13,25 +13,24 @@ LLAMOE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|
| 13 |
class LlamoeConfig(PretrainedConfig):
|
| 14 |
model_type = "Llamoe"
|
| 15 |
keys_to_ignore_at_inference = ["past_key_values"]
|
| 16 |
-
|
| 17 |
def __init__(
|
| 18 |
self,
|
| 19 |
vocab_size=32000,
|
| 20 |
-
hidden_size=
|
| 21 |
-
intermediate_size=
|
| 22 |
-
num_hidden_layers=
|
| 23 |
-
num_attention_heads=
|
| 24 |
-
num_key_value_heads=
|
| 25 |
head_dim=256,
|
| 26 |
-
hidden_act="
|
| 27 |
-
max_position_embeddings=
|
| 28 |
initializer_range=0.02,
|
| 29 |
-
rms_norm_eps=1e-
|
| 30 |
use_cache=True,
|
| 31 |
pad_token_id=0,
|
| 32 |
eos_token_id=1,
|
| 33 |
bos_token_id=2,
|
| 34 |
-
tie_word_embeddings=
|
| 35 |
rope_theta=10000.0,
|
| 36 |
attention_bias=False,
|
| 37 |
attention_dropout=0.0,
|
|
|
|
| 13 |
class LlamoeConfig(PretrainedConfig):
|
| 14 |
model_type = "Llamoe"
|
| 15 |
keys_to_ignore_at_inference = ["past_key_values"]
|
|
|
|
| 16 |
def __init__(
|
| 17 |
self,
|
| 18 |
vocab_size=32000,
|
| 19 |
+
hidden_size=4096,
|
| 20 |
+
intermediate_size=11008,
|
| 21 |
+
num_hidden_layers=32,
|
| 22 |
+
num_attention_heads=32,
|
| 23 |
+
num_key_value_heads=32,
|
| 24 |
head_dim=256,
|
| 25 |
+
hidden_act="silu",
|
| 26 |
+
max_position_embeddings=4096,
|
| 27 |
initializer_range=0.02,
|
| 28 |
+
rms_norm_eps=1e-05,
|
| 29 |
use_cache=True,
|
| 30 |
pad_token_id=0,
|
| 31 |
eos_token_id=1,
|
| 32 |
bos_token_id=2,
|
| 33 |
+
tie_word_embeddings=False,
|
| 34 |
rope_theta=10000.0,
|
| 35 |
attention_bias=False,
|
| 36 |
attention_dropout=0.0,
|