```yaml
model:
  vocab_size: 50257
  d_model: 1024
  num_layers: 12
  num_heads: 16
  num_kv_heads: 4            # 16 query heads share 4 KV heads
  max_seq_len: 2048
  architecture:
    positional: learned
    normalization: layer_norm
    norm_placement: post
    attention_variant: gqa   # grouped-query attention, per num_kv_heads < num_heads
    attention_impl: standard
    ffn_type: swiglu
    residual: standard
    embeddings: standard
    bias: false
    weight_tying: true
  initialization:
    method: normal
    std: 0.02
    embedding_std: 0.02
    attention_std: 0.02
    mlp_std: 0.02
    residual_scale: 1.0

optimizer:
  type: adamw
  learning_rate: 3e-4
  beta1: 0.9
  beta2: 0.95
  eps: 1e-8
  weight_decay: 0.01
  gradient_clip: 1.0
  scheduler:
    type: cosine
    warmup_steps: 200
    min_lr_ratio: 0.1

runtime:
  seq_len: 1024
  micro_batch_per_device: 8
  gradient_accumulation: 2
  total_tokens: 100000000
  eval_interval: 100
  log_interval: 10
  checkpoint_interval: 200
  checkpoint_max_to_keep: 3
  checkpoint_dir: checkpoints

data:
  sources: []
  max_seq_len: 1024
  packing: false
  eos_between_docs: true
  pad_to_multiple: 128

tokenizer:
  algorithm: bpe
  vocab_size: 50257
  pre_tokenizer: byte_level
  number_tokenization: single_digit
  output_format: huggingface_fast

hardware:
  accelerator: tpu
  type: v5e
  parallelism:
    data_parallel: 1
    model_parallel: 1
  compute_dtype: bfloat16
  param_dtype: float32

monitoring:
  tensorboard: false
  rich_terminal: true
```
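The `runtime` block determines the training schedule implicitly. Below is a minimal sketch of how those numbers fall out, assuming PyYAML is available and the config above is saved as `config.yaml` (the filename and the derived-quantity names are illustrative; only the config keys come from the file itself):

```python
import yaml

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

rt = cfg["runtime"]
dp = cfg["hardware"]["parallelism"]["data_parallel"]

# Tokens consumed per optimizer step: per-device micro-batch
# x gradient-accumulation steps x sequence length x data-parallel replicas.
tokens_per_step = (
    rt["micro_batch_per_device"]
    * rt["gradient_accumulation"]
    * rt["seq_len"]
    * dp
)

# Optimizer steps implied by the total token budget.
total_steps = rt["total_tokens"] // tokens_per_step

print(f"tokens/step: {tokens_per_step}")  # 8 * 2 * 1024 * 1 = 16384
print(f"total steps: {total_steps}")      # 100_000_000 // 16384 = 6103
```

With these values, the 200 warmup steps cover roughly the first 3% of the ~6,100-step run, after which the cosine schedule decays the learning rate toward 10% of its peak (`min_lr_ratio: 0.1`).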