---
# LaughLM — configs/gpu_test.yaml
# Duplicated from Dhiraj45/LaughLM (commit 9639af0)
# NOTE(review): indentation was lost in transit; nesting below is reconstructed —
# confirm section hierarchy against the config loader's schema.
model:
  vocab_size: 50257
  d_model: 1024
  num_layers: 12
  num_heads: 16
  num_kv_heads: 4
  max_seq_len: 2048
architecture:
  positional: learned
  normalization: layer_norm
  norm_placement: post
  attention_variant: mha
  attention_impl: standard
  ffn_type: swiglu
  residual: standard
  embeddings: standard
  bias: false
  weight_tying: true
initialization:
  method: normal
  std: 0.02
  embedding_std: 0.02
  attention_std: 0.02
  mlp_std: 0.02
  residual_scale: 1.0
optimizer:
  type: adamw
  # Dotted exponent form: bare "3e-4" / "1e-8" resolve to *strings* under
  # YAML 1.1 loaders (e.g. PyYAML), which require a '.' before the exponent.
  learning_rate: 3.0e-4
  beta1: 0.9
  beta2: 0.95
  eps: 1.0e-8
  weight_decay: 0.01
  gradient_clip: 1.0
scheduler:
  type: cosine
  warmup_steps: 200
  min_lr_ratio: 0.1
runtime:
  seq_len: 1024
  micro_batch_per_device: 8
  gradient_accumulation: 2
  total_tokens: 100000000
  eval_interval: 100
  log_interval: 10
  checkpoint_interval: 200
  checkpoint_max_to_keep: 3
  checkpoint_dir: checkpoints
data:
  sources: []
  max_seq_len: 1024
  packing: false
  eos_between_docs: true
  pad_to_multiple: 128
tokenizer:
  algorithm: bpe
  vocab_size: 50257
  pre_tokenizer: byte_level
  number_tokenization: single_digit
  output_format: huggingface_fast
hardware:
  accelerator: tpu
  type: v5e
  # NOTE(review): parallelism and dtype settings assumed to nest under
  # hardware — the flattened source does not show this; verify with consumer.
  parallelism:
    data_parallel: 1
    model_parallel: 1
  compute_dtype: bfloat16
  param_dtype: float32
monitoring:
  tensorboard: false
  rich_terminal: true