---
# VISDOM-32M default config
# About 32M trainable parameters with tied token embedding / LM head.
#
# NOTE: every setting must sit on its own line. If the file is collapsed onto
# a single line that begins with `#`, YAML reads the whole thing as one
# comment and the document loads as empty.

# Model
model_name: VISDOM-32M
vocab_size: 32000
block_size: 256
n_layer: 7
n_head: 8
# 456 / 8 heads = 57 per head (n_embd is evenly divisible by n_head).
n_embd: 456
dropout: 0.1
bias: false

# Training
batch_size: 4
# batch_size * gradient_accumulation_steps = 32; presumably the number of
# sequences per optimizer step -- confirm against the training loop.
gradient_accumulation_steps: 8
learning_rate: 0.0002
max_iters: 150000
eval_interval: 500
eval_iters: 50
weight_decay: 0.1
warmup_iters: 1000
grad_clip: 1.0
# One tenth of learning_rate; presumably the floor of the LR schedule --
# confirm against the scheduler.
min_lr: 0.00002

# Runtime
device: cuda
dtype: float16
compile: false
seed: 1337
num_workers: 0

# Paths
# NOTE(review): all paths are relative; presumably resolved against the
# working directory the scripts are launched from -- confirm.
data_dir: data/processed
raw_input: data/raw/input.txt
checkpoint_dir: checkpoints
tokenizer_model: data/processed/visdom_tokenizer.model
train_bin: data/processed/train.bin
val_bin: data/processed/val.bin
meta_file: data/processed/meta.json

# Tokenizer
character_coverage: 1.0
model_type: bpe
# NOTE(review): presumably the share of the corpus held out for validation
# (val_bin) -- confirm against the data-preparation script.
val_fraction: 0.1