{
  "model": {
    "vocab_size": 4096,
    "n_embd": 256,
    "n_head": 4,
    "n_layer": 4,
    "block_size": 256,
    "dropout": 0.1
  },
  "finetune": {
    "checkpoint": ".data/models/kn1ght-sft/ckpt_005000.pt",
    "output_dir": ".data/models/kn1ght-sft-v2",
    "n_per_opening": 5,
    "temperature": 0.7,
    "top_k": 40,
    "max_gen_tokens": 80,
    "min_half_moves": 6,
    "hf_dataset": "InterwebAlchemy/pgn-dataset-including-special-tokens",
    "hf_mix_games": 10000,
    "batch_size": 32,
    "learning_rate": 0.0001,
    "min_lr": 1e-05,
    "max_iters": 5000,
    "warmup_iters": 200,
    "grad_clip": 1.0,
    "weight_decay": 0.1,
    "turn_number_weight": 0.15,
    "openings_repeat": 5,
    "eval_interval": 500,
    "eval_iters": 50,
    "log_interval": 50,
    "save_interval": 1000,
    "seed": 42
  }
}