{
  "architecture": {
    "n_embd": 512,
    "n_layer": 12,
    "embed_dim": 512,
    "block_size": 256,
    "vocab_size": 499
  },
  "training": {
    "batch_size": 2,
    "max_iters": 1000000,
    "learning_rate": 6e-4,
    "min_lr": 6e-5,
    "warmup_iters": 20000,
    "weight_decay": 0.1,
    "grad_clip": 1
  },
  "evaluation": {
    "eval_interval": 25000,
    "eval_iters": 100,
    "save_interval": 25000,
    "patience": 15
  },
  "lr_schedule": {
    "lr_cycle_length": 25000,
    "lr_cycle_warmup": 10000,
    "lr_decay_rate": 0.95
  },
  "scanner": {
    "scanner_clamp": 70.0
  },
  "paths": {
    "checkpoint_path": "checkpoints/colm_checkpoint_big.pt",
    "best_checkpoint_path": "checkpoints/colm_best_big.pt",
    "tokenizer_path": "colm_tokenizer.json",
    "dataset_path": "datasets/DCDM_big_dataset.txt"
  }
}