model:
  vocab_size: 50257
  d_model: 1024
  num_layers: 12
  num_heads: 16
  num_kv_heads: 4
  max_seq_len: 2048

architecture:
  positional: learned
  normalization: layer_norm
  norm_placement: post
  attention_variant: mha
  attention_impl: standard
  ffn_type: swiglu
  residual: standard
  embeddings: standard
  bias: false
  weight_tying: true

initialization:
  method: normal
  std: 0.02
  embedding_std: 0.02
  attention_std: 0.02
  mlp_std: 0.02
  residual_scale: 1.0

optimizer:
  type: adamw
  learning_rate: 3e-4
  beta1: 0.9
  beta2: 0.95
  eps: 1e-8
  weight_decay: 0.01
  gradient_clip: 1.0

scheduler:
  type: cosine
  warmup_steps: 200
  min_lr_ratio: 0.1

runtime:
  seq_len: 1024
  micro_batch_per_device: 8
  gradient_accumulation: 2
  total_tokens: 100000000

  eval_interval: 100
  log_interval: 10

  checkpoint_interval: 200
  checkpoint_max_to_keep: 3
  checkpoint_dir: checkpoints

data:
  sources: []
  max_seq_len: 1024
  packing: false
  eos_between_docs: true
  pad_to_multiple: 128

tokenizer:
  algorithm: bpe
  vocab_size: 50257
  pre_tokenizer: byte_level
  number_tokenization: single_digit
  output_format: huggingface_fast

hardware:
  accelerator: tpu
  type: v5e

parallelism:
  data_parallel: 1
  model_parallel: 1
  compute_dtype: bfloat16
  param_dtype: float32

monitoring:
  tensorboard: false
  rich_terminal: true