{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9971106616584802, "eval_steps": 500, "global_step": 864, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 0.375, "learning_rate": 0.00012, "loss": 1.833, "step": 10 }, { "epoch": 0.05, "grad_norm": 0.28125, "learning_rate": 0.00012, "loss": 1.7119, "step": 20 }, { "epoch": 0.07, "grad_norm": 0.28125, "learning_rate": 0.00012, "loss": 1.7073, "step": 30 }, { "epoch": 0.09, "grad_norm": 0.265625, "learning_rate": 0.00012, "loss": 1.7091, "step": 40 }, { "epoch": 0.12, "grad_norm": 0.259765625, "learning_rate": 0.00012, "loss": 1.711, "step": 50 }, { "epoch": 0.14, "grad_norm": 0.259765625, "learning_rate": 0.00012, "loss": 1.7087, "step": 60 }, { "epoch": 0.16, "grad_norm": 0.259765625, "learning_rate": 0.00012, "loss": 1.6884, "step": 70 }, { "epoch": 0.18, "grad_norm": 0.2470703125, "learning_rate": 0.00012, "loss": 1.6942, "step": 80 }, { "epoch": 0.21, "grad_norm": 0.259765625, "learning_rate": 0.00012, "loss": 1.6918, "step": 90 }, { "epoch": 0.23, "grad_norm": 0.265625, "learning_rate": 0.00012, "loss": 1.7133, "step": 100 }, { "epoch": 0.25, "grad_norm": 0.25390625, "learning_rate": 0.00012, "loss": 1.7099, "step": 110 }, { "epoch": 0.28, "grad_norm": 0.255859375, "learning_rate": 0.00012, "loss": 1.6922, "step": 120 }, { "epoch": 0.3, "grad_norm": 0.2578125, "learning_rate": 0.00012, "loss": 1.6706, "step": 130 }, { "epoch": 0.32, "grad_norm": 0.2451171875, "learning_rate": 0.00012, "loss": 1.6826, "step": 140 }, { "epoch": 0.35, "grad_norm": 0.25, "learning_rate": 0.00012, "loss": 1.68, "step": 150 }, { "epoch": 0.37, "grad_norm": 0.2412109375, "learning_rate": 0.00012, "loss": 1.6542, "step": 160 }, { "epoch": 0.39, "grad_norm": 0.255859375, "learning_rate": 0.00012, "loss": 1.7029, "step": 170 }, { "epoch": 0.42, "grad_norm": 0.267578125, "learning_rate": 0.00012, "loss": 1.6562, "step": 180 }, { "epoch": 0.44, "grad_norm": 0.2490234375, "learning_rate": 0.00012, "loss": 1.6612, "step": 190 }, { "epoch": 0.46, "grad_norm": 0.26171875, "learning_rate": 0.00012, "loss": 1.66, "step": 200 }, { "epoch": 0.49, "grad_norm": 0.244140625, "learning_rate": 0.00012, "loss": 1.678, "step": 210 }, { "epoch": 0.51, "grad_norm": 0.25390625, "learning_rate": 0.00012, "loss": 1.6882, "step": 220 }, { "epoch": 0.53, "grad_norm": 0.251953125, "learning_rate": 0.00012, "loss": 1.6925, "step": 230 }, { "epoch": 0.55, "grad_norm": 0.255859375, "learning_rate": 0.00012, "loss": 1.6788, "step": 240 }, { "epoch": 0.58, "grad_norm": 0.251953125, "learning_rate": 0.00012, "loss": 1.6792, "step": 250 }, { "epoch": 0.6, "grad_norm": 0.25, "learning_rate": 0.00012, "loss": 1.6871, "step": 260 }, { "epoch": 0.62, "grad_norm": 0.26171875, "learning_rate": 0.00012, "loss": 1.7054, "step": 270 }, { "epoch": 0.65, "grad_norm": 0.2578125, "learning_rate": 0.00012, "loss": 1.6526, "step": 280 }, { "epoch": 0.67, "grad_norm": 0.2470703125, "learning_rate": 0.00012, "loss": 1.6842, "step": 290 }, { "epoch": 0.69, "grad_norm": 0.251953125, "learning_rate": 0.00012, "loss": 1.6893, "step": 300 }, { "epoch": 0.72, "grad_norm": 0.2578125, "learning_rate": 0.00012, "loss": 1.6742, "step": 310 }, { "epoch": 0.74, "grad_norm": 0.28515625, "learning_rate": 0.00012, "loss": 1.6668, "step": 320 }, { "epoch": 0.76, "grad_norm": 0.244140625, "learning_rate": 0.00012, "loss": 1.6551, "step": 330 }, { "epoch": 0.79, "grad_norm": 0.2578125, "learning_rate": 0.00012, "loss": 1.6672, "step": 340 }, { "epoch": 0.81, "grad_norm": 0.255859375, "learning_rate": 0.00012, "loss": 1.6478, "step": 350 }, { "epoch": 0.83, "grad_norm": 0.251953125, "learning_rate": 0.00012, "loss": 1.6717, "step": 360 }, { "epoch": 0.86, "grad_norm": 0.2578125, "learning_rate": 0.00012, "loss": 1.6525, "step": 370 }, { "epoch": 0.88, "grad_norm": 0.251953125, "learning_rate": 0.00012, "loss": 1.6667, "step": 380 }, { "epoch": 0.9, "grad_norm": 0.2578125, "learning_rate": 0.00012, "loss": 1.6454, "step": 390 }, { "epoch": 0.92, "grad_norm": 0.265625, "learning_rate": 0.00012, "loss": 1.668, "step": 400 }, { "epoch": 0.95, "grad_norm": 0.265625, "learning_rate": 0.00012, "loss": 1.6635, "step": 410 }, { "epoch": 0.97, "grad_norm": 0.265625, "learning_rate": 0.00012, "loss": 1.6652, "step": 420 }, { "epoch": 0.99, "grad_norm": 0.263671875, "learning_rate": 0.00012, "loss": 1.6901, "step": 430 }, { "epoch": 1.0, "eval_loss": 1.692740559577942, "eval_runtime": 905.4397, "eval_samples_per_second": 0.984, "eval_steps_per_second": 0.984, "step": 432 }, { "epoch": 1.02, "grad_norm": 0.259765625, "learning_rate": 0.00012, "loss": 1.5637, "step": 440 }, { "epoch": 1.04, "grad_norm": 0.2734375, "learning_rate": 0.00012, "loss": 1.5142, "step": 450 }, { "epoch": 1.06, "grad_norm": 0.294921875, "learning_rate": 0.00012, "loss": 1.533, "step": 460 }, { "epoch": 1.09, "grad_norm": 0.296875, "learning_rate": 0.00012, "loss": 1.5256, "step": 470 }, { "epoch": 1.11, "grad_norm": 0.318359375, "learning_rate": 0.00012, "loss": 1.5749, "step": 480 }, { "epoch": 1.13, "grad_norm": 0.302734375, "learning_rate": 0.00012, "loss": 1.5535, "step": 490 }, { "epoch": 1.16, "grad_norm": 0.3359375, "learning_rate": 0.00012, "loss": 1.5754, "step": 500 }, { "epoch": 1.18, "grad_norm": 0.302734375, "learning_rate": 0.00012, "loss": 1.52, "step": 510 }, { "epoch": 1.2, "grad_norm": 0.31640625, "learning_rate": 0.00012, "loss": 1.5559, "step": 520 }, { "epoch": 1.23, "grad_norm": 0.32421875, "learning_rate": 0.00012, "loss": 1.5231, "step": 530 }, { "epoch": 1.25, "grad_norm": 0.3203125, "learning_rate": 0.00012, "loss": 1.5671, "step": 540 }, { "epoch": 1.27, "grad_norm": 0.318359375, "learning_rate": 0.00012, "loss": 1.5529, "step": 550 }, { "epoch": 1.29, "grad_norm": 0.35546875, "learning_rate": 0.00012, "loss": 1.5415, "step": 560 }, { "epoch": 1.32, "grad_norm": 0.33203125, "learning_rate": 0.00012, "loss": 1.5511, "step": 570 }, { "epoch": 1.34, "grad_norm": 0.3359375, "learning_rate": 0.00012, "loss": 1.5398, "step": 580 }, { "epoch": 1.36, "grad_norm": 0.34375, "learning_rate": 0.00012, "loss": 1.5426, "step": 590 }, { "epoch": 1.39, "grad_norm": 0.35546875, "learning_rate": 0.00012, "loss": 1.5334, "step": 600 }, { "epoch": 1.41, "grad_norm": 0.330078125, "learning_rate": 0.00012, "loss": 1.5096, "step": 610 }, { "epoch": 1.43, "grad_norm": 0.333984375, "learning_rate": 0.00012, "loss": 1.5416, "step": 620 }, { "epoch": 1.46, "grad_norm": 0.376953125, "learning_rate": 0.00012, "loss": 1.5343, "step": 630 }, { "epoch": 1.48, "grad_norm": 0.3359375, "learning_rate": 0.00012, "loss": 1.5416, "step": 640 }, { "epoch": 1.5, "grad_norm": 0.33984375, "learning_rate": 0.00012, "loss": 1.5444, "step": 650 }, { "epoch": 1.53, "grad_norm": 0.35546875, "learning_rate": 0.00012, "loss": 1.5112, "step": 660 }, { "epoch": 1.55, "grad_norm": 0.35546875, "learning_rate": 0.00012, "loss": 1.5403, "step": 670 }, { "epoch": 1.57, "grad_norm": 0.337890625, "learning_rate": 0.00012, "loss": 1.532, "step": 680 }, { "epoch": 1.59, "grad_norm": 0.345703125, "learning_rate": 0.00012, "loss": 1.5451, "step": 690 }, { "epoch": 1.62, "grad_norm": 0.337890625, "learning_rate": 0.00012, "loss": 1.5487, "step": 700 }, { "epoch": 1.64, "grad_norm": 0.376953125, "learning_rate": 0.00012, "loss": 1.5529, "step": 710 }, { "epoch": 1.66, "grad_norm": 0.3515625, "learning_rate": 0.00012, "loss": 1.5351, "step": 720 }, { "epoch": 1.69, "grad_norm": 0.33984375, "learning_rate": 0.00012, "loss": 1.5045, "step": 730 }, { "epoch": 1.71, "grad_norm": 0.33984375, "learning_rate": 0.00012, "loss": 1.5431, "step": 740 }, { "epoch": 1.73, "grad_norm": 0.353515625, "learning_rate": 0.00012, "loss": 1.5456, "step": 750 }, { "epoch": 1.76, "grad_norm": 0.353515625, "learning_rate": 0.00012, "loss": 1.537, "step": 760 }, { "epoch": 1.78, "grad_norm": 0.3671875, "learning_rate": 0.00012, "loss": 1.5375, "step": 770 }, { "epoch": 1.8, "grad_norm": 0.345703125, "learning_rate": 0.00012, "loss": 1.5305, "step": 780 }, { "epoch": 1.83, "grad_norm": 0.369140625, "learning_rate": 0.00012, "loss": 1.5455, "step": 790 }, { "epoch": 1.85, "grad_norm": 0.37109375, "learning_rate": 0.00012, "loss": 1.5163, "step": 800 }, { "epoch": 1.87, "grad_norm": 0.349609375, "learning_rate": 0.00012, "loss": 1.5578, "step": 810 }, { "epoch": 1.9, "grad_norm": 0.3515625, "learning_rate": 0.00012, "loss": 1.5191, "step": 820 }, { "epoch": 1.92, "grad_norm": 0.357421875, "learning_rate": 0.00012, "loss": 1.5257, "step": 830 }, { "epoch": 1.94, "grad_norm": 0.35546875, "learning_rate": 0.00012, "loss": 1.527, "step": 840 }, { "epoch": 1.96, "grad_norm": 0.361328125, "learning_rate": 0.00012, "loss": 1.51, "step": 850 }, { "epoch": 1.99, "grad_norm": 0.359375, "learning_rate": 0.00012, "loss": 1.5232, "step": 860 }, { "epoch": 2.0, "eval_loss": 1.692084550857544, "eval_runtime": 905.0512, "eval_samples_per_second": 0.984, "eval_steps_per_second": 0.984, "step": 864 } ], "logging_steps": 10, "max_steps": 864, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 6.110663045346755e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }