|
|
# Keyword arguments for building a ~520M-parameter Llama model config
# (Hugging Face `LlamaConfig`-style field names).
# NOTE(review): vocab_size=8 is far below any real Llama tokenizer size —
# presumably a debug/unit-test placeholder; confirm before training.
LLAMA_520M_CONFIG_DICT = {
    "vocab_size": 8,
    # Context length and rotary-embedding setup.
    "max_position_embeddings": 131072,
    "rope_theta": 500000.0,
    "rope_scaling": {
        "rope_type": "llama3",
        "factor": 8.0,
        "low_freq_factor": 1.0,
        "high_freq_factor": 4.0,
        "original_max_position_embeddings": 8192,
    },
    # Transformer geometry.
    "hidden_size": 1024,
    "intermediate_size": 4096,
    "num_hidden_layers": 30,
    "num_attention_heads": 16,
    "num_key_value_heads": 16,  # equal to num_attention_heads -> plain MHA, no GQA sharing
    "head_dim": 64,
    # Attention / MLP details.
    "attn_implementation": "sdpa",
    "attention_bias": False,
    "attention_dropout": 0.0,
    "mlp_bias": False,
    "hidden_act": "silu",
    # Normalization and initialization.
    "rms_norm_eps": 1e-05,
    "initializer_range": 0.02,
    # Misc model-level flags.
    "model_type": "llama",
    "tie_word_embeddings": False,
    "pretraining_tp": 1,
    "torch_dtype": "bfloat16",
    "use_cache": True,
}
|
|
|
|
|
# Registry of available Llama configurations, keyed by model name.
LLAMA_CONFIGS = dict(
    Llama_520M=LLAMA_520M_CONFIG_DICT,
)
|
|
|