---
# VISDOM-32M default config
# About 32M trainable parameters with tied token embedding / LM head.

# Model
model_name: VISDOM-32M
vocab_size: 32000
block_size: 256
n_layer: 7
n_head: 8
# 456 / 8 = 57 per head, assuming heads split n_embd evenly (TODO confirm).
n_embd: 456
dropout: 0.1
bias: false

# Training
batch_size: 4
# NOTE(review): presumably 4 x 8 = 32 sequences per optimizer step; verify
# against the training loop.
gradient_accumulation_steps: 8
learning_rate: 0.0002
max_iters: 150000
eval_interval: 500
eval_iters: 50
weight_decay: 0.1
warmup_iters: 1000
grad_clip: 1.0
# One tenth of learning_rate; presumably the floor of the LR schedule.
min_lr: 0.00002

# Runtime
device: cuda
# NOTE(review): float16 training usually needs loss scaling; confirm the
# trainer handles it.
dtype: float16
compile: false
seed: 1337
num_workers: 0

# Paths
# Every generated artifact below lives under data_dir (data/processed).
data_dir: data/processed
raw_input: data/raw/input.txt
checkpoint_dir: checkpoints
tokenizer_model: data/processed/visdom_tokenizer.model
train_bin: data/processed/train.bin
val_bin: data/processed/val.bin
meta_file: data/processed/meta.json

# Tokenizer
# NOTE(review): these look like tokenizer-training options; confirm against
# the tokenizer script.
character_coverage: 1.0
model_type: bpe
# Presumably the share of the corpus held out for validation (TODO confirm).
val_fraction: 0.1