| { |
| "step": 5000, |
| "val_bpb": 0.9190692005880656, |
| "model_config": { |
| "sequence_len": 2048, |
| "vocab_size": 65536, |
| "n_layer": 20, |
| "n_head": 10, |
| "n_kv_head": 10, |
| "n_embd": 1280, |
| "n_prelude": 2, |
| "n_recur_block": 4, |
| "n_coda": 2, |
| "train_recur_mean": 4.0, |
| "train_recur_max": 16, |
| "bptt_k": 4 |
| }, |
| "user_config": { |
| "run": "recursive-d20", |
| "device_type": "", |
| "depth": 20, |
| "max_seq_len": 2048, |
| "n_prelude": 2, |
| "n_recur_block": 4, |
| "n_coda": 2, |
| "train_recur_mean": 4.0, |
| "train_recur_max": 16, |
| "bptt_k": 4, |
| "num_iterations": -1, |
| "target_flops": -1.0, |
| "target_param_data_ratio": 34, |
| "device_batch_size": 32, |
| "total_batch_size": 524288, |
| "embedding_lr": 0.2, |
| "unembedding_lr": 0.004, |
| "weight_decay": 0.0, |
| "matrix_lr": 0.02, |
| "grad_clip": 1.0, |
| "warmup_ratio": 0.0, |
| "warmdown_ratio": 0.2, |
| "final_lr_frac": 0.0, |
| "resume_from_step": -1, |
| "eval_every": 250, |
| "eval_tokens": 10485760, |
| "core_metric_every": 2000, |
| "core_metric_max_per_task": 500, |
| "sample_every": 2000, |
| "save_every": 5000, |
| "model_tag": "" |
| }, |
| "device_batch_size": 32, |
| "max_seq_len": 2048, |
| "dataloader_state_dict": { |
| "pq_idx": 46, |
| "rg_idx": 24 |
| }, |
| "loop_state": { |
| "min_val_bpb": 0.9190692005880656, |
| "smooth_train_loss": 3.102558805513702, |
| "total_training_time": 4112.87543463707 |
| } |
| } |