```yaml
model:
  vocab_size: 50257
  d_model: 1024
  num_layers: 12
  num_heads: 16
  num_kv_heads: 4
  max_seq_len: 2048
architecture:
  positional: learned
  normalization: layer_norm
  norm_placement: post
  attention_variant: mha
  attention_impl: standard
  ffn_type: swiglu
  residual: standard
  embeddings: standard
  bias: false
  weight_tying: true
initialization:
  method: normal
  std: 0.02
  embedding_std: 0.02
  attention_std: 0.02
  mlp_std: 0.02
  residual_scale: 1.0
optimizer:
  type: adamw
  learning_rate: 3e-4
  beta1: 0.9
  beta2: 0.95
  eps: 1e-8
  weight_decay: 0.01
  gradient_clip: 1.0
scheduler:
  type: cosine
  warmup_steps: 200
  min_lr_ratio: 0.1
runtime:
  seq_len: 1024
  micro_batch_per_device: 8
  gradient_accumulation: 2
  total_tokens: 100000000
  eval_interval: 100
  log_interval: 10
  checkpoint_interval: 200
  checkpoint_max_to_keep: 3
  checkpoint_dir: checkpoints
data:
  sources: []
  max_seq_len: 1024
  packing: false
  eos_between_docs: true
  pad_to_multiple: 128
tokenizer:
  algorithm: bpe
  vocab_size: 50257
  pre_tokenizer: byte_level
  number_tokenization: single_digit
  output_format: huggingface_fast
hardware:
  accelerator: tpu
  type: v5e
  parallelism:
    data_parallel: 1
    model_parallel: 1
  compute_dtype: bfloat16
  param_dtype: float32
monitoring:
  tensorboard: false
  rich_terminal: true
```
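
In the architecture block, `num_heads: 16` over `d_model: 1024` gives a head dimension of 64, and `num_kv_heads: 4` means four query heads share each key/value head, i.e. grouped-query attention, which shrinks the KV cache fourfold. Note that `attention_variant: mha` would normally imply 16 KV heads; whether `num_kv_heads` is honored when the variant is `mha` depends on the trainer, so the grouped case in this minimal NumPy sketch is our reading, not something the config states:

```python
import numpy as np

d_model, num_heads, num_kv_heads = 1024, 16, 4
head_dim = d_model // num_heads        # 64
group = num_heads // num_kv_heads      # 4 query heads per KV head

def gqa_attention(q, k, v):
    """Causal attention. q: (T, num_heads, head_dim); k, v: (T, num_kv_heads, head_dim)."""
    k = np.repeat(k, group, axis=1)    # expand each KV head across its query group
    v = np.repeat(v, group, axis=1)
    scores = np.einsum("qhd,khd->hqk", q, k) / np.sqrt(head_dim)
    T = q.shape[0]
    causal = np.tril(np.ones((T, T), dtype=bool))
    scores = np.where(causal, scores, -np.inf)
    probs = np.exp(scores - scores.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)
    return np.einsum("hqk,khd->qhd", probs, v)

out = gqa_attention(
    np.random.randn(8, num_heads, head_dim),
    np.random.randn(8, num_kv_heads, head_dim),
    np.random.randn(8, num_kv_heads, head_dim),
)  # out.shape == (8, 16, 64)
```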
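
`ffn_type: swiglu` with `bias: false` corresponds to the gated feed-forward below. The config does not state a hidden width, so the common roughly (8/3)·d_model sizing here is an assumption, as is tying the weight scale to the `std: 0.02` from the initialization block:

```python
import numpy as np

d_model = 1024
d_ff = int(8 * d_model / 3)  # hidden width is not in the config; a common SwiGLU choice

rng = np.random.default_rng(0)
w_gate = rng.normal(0.0, 0.02, (d_model, d_ff))  # std 0.02 per the initialization block
w_up   = rng.normal(0.0, 0.02, (d_model, d_ff))
w_down = rng.normal(0.0, 0.02, (d_ff, d_model))

def swiglu_ffn(x):
    """SwiGLU: SiLU-gated up-projection, no bias terms (bias: false)."""
    gate = x @ w_gate
    return (gate / (1.0 + np.exp(-gate)) * (x @ w_up)) @ w_down  # SiLU(g) = g * sigmoid(g)
```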
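
The runtime numbers pin down the optimizer-step budget. With `data_parallel: 1`, each step consumes `seq_len × micro_batch_per_device × gradient_accumulation` = 1024 × 8 × 2 = 16,384 tokens, so `total_tokens: 100000000` works out to roughly 6,100 steps. A sketch of that arithmetic (the helper name is ours, not part of the config schema):

```python
def steps_from_config(total_tokens, seq_len, micro_batch, grad_accum, data_parallel=1):
    """Tokens consumed per optimizer step, and the resulting step budget."""
    tokens_per_step = seq_len * micro_batch * grad_accum * data_parallel
    return tokens_per_step, total_tokens // tokens_per_step

tokens_per_step, total_steps = steps_from_config(
    total_tokens=100_000_000, seq_len=1024, micro_batch=8, grad_accum=2
)
print(tokens_per_step, total_steps)  # 16384 6103
```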
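
The scheduler block reads as linear warmup for 200 steps followed by cosine decay to a floor of `min_lr_ratio × learning_rate` = 3e-5. A minimal sketch under that assumption; tying the decay horizon to the ~6,103-step budget above is our inference, the config does not spell it out:

```python
import math

def lr_at(step, base_lr=3e-4, warmup_steps=200, total_steps=6103, min_lr_ratio=0.1):
    """Linear warmup, then cosine decay to min_lr_ratio * base_lr."""
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    cosine = 0.5 * (1.0 + math.cos(math.pi * progress))
    return base_lr * (min_lr_ratio + (1.0 - min_lr_ratio) * cosine)
```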
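
In the data block, `packing: false` with `pad_to_multiple: 128` suggests each document is padded out to a length multiple of 128 rather than packed back-to-back, with `eos_between_docs: true` still terminating each document with an EOS token. A sketch of that reading (`eos_id` and `pad_id` are placeholders, not values from the config):

```python
def prepare_doc(ids, eos_id, pad_id, multiple=128, max_len=1024):
    """Terminate with EOS, truncate to max_seq_len, then pad to a multiple of 128."""
    ids = (ids + [eos_id])[:max_len]          # eos_between_docs: true
    rem = len(ids) % multiple
    if rem:
        ids = ids + [pad_id] * (multiple - rem)
    return ids
```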
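
The tokenizer block maps fairly directly onto the Hugging Face `tokenizers` library. A sketch assuming that library; the training file path and the `<|endoftext|>` special token are placeholders (the GPT-2 convention matching the 50257 vocabulary), not values from the config:

```python
from tokenizers import Tokenizer, models, pre_tokenizers, trainers
from transformers import PreTrainedTokenizerFast

tok = Tokenizer(models.BPE())                       # algorithm: bpe
tok.pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.Digits(individual_digits=True),  # number_tokenization: single_digit
    pre_tokenizers.ByteLevel(),                     # pre_tokenizer: byte_level
])
trainer = trainers.BpeTrainer(vocab_size=50257, special_tokens=["<|endoftext|>"])
tok.train(files=["corpus.txt"], trainer=trainer)    # placeholder path

hf_tok = PreTrainedTokenizerFast(tokenizer_object=tok)  # output_format: huggingface_fast
```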