{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9969620253164557, "eval_steps": 500, "global_step": 986, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020253164556962026, "grad_norm": 2.8830449086777117, "learning_rate": 5e-06, "loss": 0.7568, "step": 10 }, { "epoch": 0.04050632911392405, "grad_norm": 2.303422506767745, "learning_rate": 5e-06, "loss": 0.6504, "step": 20 }, { "epoch": 0.060759493670886074, "grad_norm": 1.6860772420358063, "learning_rate": 5e-06, "loss": 0.6288, "step": 30 }, { "epoch": 0.0810126582278481, "grad_norm": 2.0970267075945785, "learning_rate": 5e-06, "loss": 0.6134, "step": 40 }, { "epoch": 0.10126582278481013, "grad_norm": 1.6054998154511506, "learning_rate": 5e-06, "loss": 0.6037, "step": 50 }, { "epoch": 0.12151898734177215, "grad_norm": 1.838238268065728, "learning_rate": 5e-06, "loss": 0.5983, "step": 60 }, { "epoch": 0.14177215189873418, "grad_norm": 1.6213305380578262, "learning_rate": 5e-06, "loss": 0.5938, "step": 70 }, { "epoch": 0.1620253164556962, "grad_norm": 2.1780011248652023, "learning_rate": 5e-06, "loss": 0.5882, "step": 80 }, { "epoch": 0.18227848101265823, "grad_norm": 2.103534064613827, "learning_rate": 5e-06, "loss": 0.5863, "step": 90 }, { "epoch": 0.20253164556962025, "grad_norm": 1.8335202039731435, "learning_rate": 5e-06, "loss": 0.5888, "step": 100 }, { "epoch": 0.22278481012658227, "grad_norm": 1.523297129174684, "learning_rate": 5e-06, "loss": 0.5809, "step": 110 }, { "epoch": 0.2430379746835443, "grad_norm": 2.0170719153012904, "learning_rate": 5e-06, "loss": 0.5788, "step": 120 }, { "epoch": 0.26329113924050634, "grad_norm": 1.7622257691924685, "learning_rate": 5e-06, "loss": 0.5777, "step": 130 }, { "epoch": 0.28354430379746837, "grad_norm": 1.7181353902048122, "learning_rate": 5e-06, "loss": 0.5672, "step": 140 }, { "epoch": 0.3037974683544304, "grad_norm": 1.4401641944683279, "learning_rate": 5e-06, "loss": 0.5793, "step": 150 }, { "epoch": 0.3240506329113924, "grad_norm": 1.4242865070941313, "learning_rate": 5e-06, "loss": 0.5788, "step": 160 }, { "epoch": 0.34430379746835443, "grad_norm": 1.7980269158065236, "learning_rate": 5e-06, "loss": 0.5794, "step": 170 }, { "epoch": 0.36455696202531646, "grad_norm": 1.4779433518605618, "learning_rate": 5e-06, "loss": 0.5742, "step": 180 }, { "epoch": 0.3848101265822785, "grad_norm": 1.8693708290648419, "learning_rate": 5e-06, "loss": 0.5742, "step": 190 }, { "epoch": 0.4050632911392405, "grad_norm": 1.8949198189001324, "learning_rate": 5e-06, "loss": 0.5695, "step": 200 }, { "epoch": 0.4253164556962025, "grad_norm": 1.435742719558278, "learning_rate": 5e-06, "loss": 0.5649, "step": 210 }, { "epoch": 0.44556962025316454, "grad_norm": 1.5468357652273146, "learning_rate": 5e-06, "loss": 0.5692, "step": 220 }, { "epoch": 0.46582278481012657, "grad_norm": 1.4031725584973305, "learning_rate": 5e-06, "loss": 0.5682, "step": 230 }, { "epoch": 0.4860759493670886, "grad_norm": 1.4275164314242066, "learning_rate": 5e-06, "loss": 0.5643, "step": 240 }, { "epoch": 0.5063291139240507, "grad_norm": 1.2176273594878788, "learning_rate": 5e-06, "loss": 0.5678, "step": 250 }, { "epoch": 0.5265822784810127, "grad_norm": 1.2690186560536825, "learning_rate": 5e-06, "loss": 0.5548, "step": 260 }, { "epoch": 0.5468354430379747, "grad_norm": 1.83660932215292, "learning_rate": 5e-06, "loss": 0.5644, "step": 270 }, { "epoch": 0.5670886075949367, "grad_norm": 1.6169905329132488, "learning_rate": 5e-06, "loss": 0.5593, "step": 280 }, { "epoch": 0.5873417721518988, "grad_norm": 1.4338044643322776, "learning_rate": 5e-06, "loss": 0.5586, "step": 290 }, { "epoch": 0.6075949367088608, "grad_norm": 1.7201385340518724, "learning_rate": 5e-06, "loss": 0.5582, "step": 300 }, { "epoch": 0.6278481012658228, "grad_norm": 1.2567147302918276, "learning_rate": 5e-06, "loss": 0.5637, "step": 310 }, { "epoch": 0.6481012658227848, "grad_norm": 1.2477603311793903, "learning_rate": 5e-06, "loss": 0.5599, "step": 320 }, { "epoch": 0.6683544303797468, "grad_norm": 1.4335133075994282, "learning_rate": 5e-06, "loss": 0.5602, "step": 330 }, { "epoch": 0.6886075949367089, "grad_norm": 1.4578243700932763, "learning_rate": 5e-06, "loss": 0.5604, "step": 340 }, { "epoch": 0.7088607594936709, "grad_norm": 1.2082777377225689, "learning_rate": 5e-06, "loss": 0.5616, "step": 350 }, { "epoch": 0.7291139240506329, "grad_norm": 1.231089840082507, "learning_rate": 5e-06, "loss": 0.5636, "step": 360 }, { "epoch": 0.7493670886075949, "grad_norm": 1.3997981963290846, "learning_rate": 5e-06, "loss": 0.5562, "step": 370 }, { "epoch": 0.769620253164557, "grad_norm": 1.2047345664692388, "learning_rate": 5e-06, "loss": 0.5527, "step": 380 }, { "epoch": 0.789873417721519, "grad_norm": 1.4342172334673526, "learning_rate": 5e-06, "loss": 0.5555, "step": 390 }, { "epoch": 0.810126582278481, "grad_norm": 1.2558989205908657, "learning_rate": 5e-06, "loss": 0.556, "step": 400 }, { "epoch": 0.830379746835443, "grad_norm": 1.2117418919687244, "learning_rate": 5e-06, "loss": 0.5502, "step": 410 }, { "epoch": 0.850632911392405, "grad_norm": 1.185811964971908, "learning_rate": 5e-06, "loss": 0.5581, "step": 420 }, { "epoch": 0.8708860759493671, "grad_norm": 1.1218795286529273, "learning_rate": 5e-06, "loss": 0.5517, "step": 430 }, { "epoch": 0.8911392405063291, "grad_norm": 1.3887849252274234, "learning_rate": 5e-06, "loss": 0.5573, "step": 440 }, { "epoch": 0.9113924050632911, "grad_norm": 1.0476717005428378, "learning_rate": 5e-06, "loss": 0.5485, "step": 450 }, { "epoch": 0.9316455696202531, "grad_norm": 1.06285887248084, "learning_rate": 5e-06, "loss": 0.5527, "step": 460 }, { "epoch": 0.9518987341772152, "grad_norm": 1.1062346230921074, "learning_rate": 5e-06, "loss": 0.5536, "step": 470 }, { "epoch": 0.9721518987341772, "grad_norm": 1.1370240328967087, "learning_rate": 5e-06, "loss": 0.5506, "step": 480 }, { "epoch": 0.9924050632911392, "grad_norm": 1.1112526055158258, "learning_rate": 5e-06, "loss": 0.5472, "step": 490 }, { "epoch": 0.9984810126582279, "eval_loss": 0.06877367943525314, "eval_runtime": 510.2093, "eval_samples_per_second": 26.076, "eval_steps_per_second": 0.408, "step": 493 }, { "epoch": 1.0126582278481013, "grad_norm": 1.896508901425431, "learning_rate": 5e-06, "loss": 0.5062, "step": 500 }, { "epoch": 1.0329113924050632, "grad_norm": 1.29214738627033, "learning_rate": 5e-06, "loss": 0.4713, "step": 510 }, { "epoch": 1.0531645569620254, "grad_norm": 1.3143564009223339, "learning_rate": 5e-06, "loss": 0.4665, "step": 520 }, { "epoch": 1.0734177215189873, "grad_norm": 1.6998357379137725, "learning_rate": 5e-06, "loss": 0.4683, "step": 530 }, { "epoch": 1.0936708860759494, "grad_norm": 1.2775472369900311, "learning_rate": 5e-06, "loss": 0.4641, "step": 540 }, { "epoch": 1.1139240506329113, "grad_norm": 1.2895233559545232, "learning_rate": 5e-06, "loss": 0.4664, "step": 550 }, { "epoch": 1.1341772151898735, "grad_norm": 2.336669368486411, "learning_rate": 5e-06, "loss": 0.4633, "step": 560 }, { "epoch": 1.1544303797468354, "grad_norm": 1.552813991949337, "learning_rate": 5e-06, "loss": 0.4693, "step": 570 }, { "epoch": 1.1746835443037975, "grad_norm": 1.7264516276776805, "learning_rate": 5e-06, "loss": 0.4677, "step": 580 }, { "epoch": 1.1949367088607594, "grad_norm": 1.5668329127595755, "learning_rate": 5e-06, "loss": 0.4611, "step": 590 }, { "epoch": 1.2151898734177216, "grad_norm": 1.5420195271384818, "learning_rate": 5e-06, "loss": 0.4683, "step": 600 }, { "epoch": 1.2354430379746835, "grad_norm": 1.4025799668342696, "learning_rate": 5e-06, "loss": 0.4648, "step": 610 }, { "epoch": 1.2556962025316456, "grad_norm": 1.410087562343117, "learning_rate": 5e-06, "loss": 0.4691, "step": 620 }, { "epoch": 1.2759493670886077, "grad_norm": 1.3134227418822069, "learning_rate": 5e-06, "loss": 0.4699, "step": 630 }, { "epoch": 1.2962025316455696, "grad_norm": 1.265890889850941, "learning_rate": 5e-06, "loss": 0.47, "step": 640 }, { "epoch": 1.3164556962025316, "grad_norm": 1.347312704270352, "learning_rate": 5e-06, "loss": 0.4747, "step": 650 }, { "epoch": 1.3367088607594937, "grad_norm": 1.5665345840182998, "learning_rate": 5e-06, "loss": 0.4637, "step": 660 }, { "epoch": 1.3569620253164558, "grad_norm": 1.4061410250755932, "learning_rate": 5e-06, "loss": 0.4768, "step": 670 }, { "epoch": 1.3772151898734177, "grad_norm": 1.2663454266336562, "learning_rate": 5e-06, "loss": 0.4733, "step": 680 }, { "epoch": 1.3974683544303796, "grad_norm": 1.1821039871004464, "learning_rate": 5e-06, "loss": 0.473, "step": 690 }, { "epoch": 1.4177215189873418, "grad_norm": 1.156981338736933, "learning_rate": 5e-06, "loss": 0.4712, "step": 700 }, { "epoch": 1.437974683544304, "grad_norm": 1.2263612576805232, "learning_rate": 5e-06, "loss": 0.477, "step": 710 }, { "epoch": 1.4582278481012658, "grad_norm": 1.2801673642156481, "learning_rate": 5e-06, "loss": 0.4772, "step": 720 }, { "epoch": 1.4784810126582277, "grad_norm": 1.2617356305246052, "learning_rate": 5e-06, "loss": 0.4799, "step": 730 }, { "epoch": 1.4987341772151899, "grad_norm": 1.2354318184866413, "learning_rate": 5e-06, "loss": 0.4728, "step": 740 }, { "epoch": 1.518987341772152, "grad_norm": 1.2511733882832696, "learning_rate": 5e-06, "loss": 0.4757, "step": 750 }, { "epoch": 1.539240506329114, "grad_norm": 1.2772298337747716, "learning_rate": 5e-06, "loss": 0.4768, "step": 760 }, { "epoch": 1.5594936708860758, "grad_norm": 1.2992839040784614, "learning_rate": 5e-06, "loss": 0.4759, "step": 770 }, { "epoch": 1.579746835443038, "grad_norm": 1.7049378845615897, "learning_rate": 5e-06, "loss": 0.4766, "step": 780 }, { "epoch": 1.6, "grad_norm": 1.6571454832942571, "learning_rate": 5e-06, "loss": 0.4789, "step": 790 }, { "epoch": 1.620253164556962, "grad_norm": 1.9054844094236882, "learning_rate": 5e-06, "loss": 0.4736, "step": 800 }, { "epoch": 1.640506329113924, "grad_norm": 1.5464356008924167, "learning_rate": 5e-06, "loss": 0.466, "step": 810 }, { "epoch": 1.660759493670886, "grad_norm": 1.5111838450770525, "learning_rate": 5e-06, "loss": 0.4743, "step": 820 }, { "epoch": 1.6810126582278482, "grad_norm": 1.42468780507972, "learning_rate": 5e-06, "loss": 0.4703, "step": 830 }, { "epoch": 1.70126582278481, "grad_norm": 1.3971247658469674, "learning_rate": 5e-06, "loss": 0.4711, "step": 840 }, { "epoch": 1.721518987341772, "grad_norm": 1.3361669855974696, "learning_rate": 5e-06, "loss": 0.4734, "step": 850 }, { "epoch": 1.7417721518987341, "grad_norm": 1.2864567717133562, "learning_rate": 5e-06, "loss": 0.4733, "step": 860 }, { "epoch": 1.7620253164556963, "grad_norm": 1.379392846668321, "learning_rate": 5e-06, "loss": 0.4764, "step": 870 }, { "epoch": 1.7822784810126582, "grad_norm": 1.2161972975914068, "learning_rate": 5e-06, "loss": 0.477, "step": 880 }, { "epoch": 1.80253164556962, "grad_norm": 1.1457621158991818, "learning_rate": 5e-06, "loss": 0.4741, "step": 890 }, { "epoch": 1.8227848101265822, "grad_norm": 1.165588356364841, "learning_rate": 5e-06, "loss": 0.4744, "step": 900 }, { "epoch": 1.8430379746835444, "grad_norm": 1.232051152752489, "learning_rate": 5e-06, "loss": 0.477, "step": 910 }, { "epoch": 1.8632911392405065, "grad_norm": 1.2278118260508522, "learning_rate": 5e-06, "loss": 0.4764, "step": 920 }, { "epoch": 1.8835443037974684, "grad_norm": 1.1460939229882365, "learning_rate": 5e-06, "loss": 0.4734, "step": 930 }, { "epoch": 1.9037974683544303, "grad_norm": 1.1502258411053914, "learning_rate": 5e-06, "loss": 0.4775, "step": 940 }, { "epoch": 1.9240506329113924, "grad_norm": 1.3240628061111426, "learning_rate": 5e-06, "loss": 0.4785, "step": 950 }, { "epoch": 1.9443037974683546, "grad_norm": 1.4234146644601138, "learning_rate": 5e-06, "loss": 0.4827, "step": 960 }, { "epoch": 1.9645569620253165, "grad_norm": 1.1410192422559635, "learning_rate": 5e-06, "loss": 0.4807, "step": 970 }, { "epoch": 1.9848101265822784, "grad_norm": 1.233279816619483, "learning_rate": 5e-06, "loss": 0.4813, "step": 980 }, { "epoch": 1.9969620253164557, "eval_loss": 0.06898781657218933, "eval_runtime": 512.3069, "eval_samples_per_second": 25.969, "eval_steps_per_second": 0.406, "step": 986 }, { "epoch": 1.9969620253164557, "step": 986, "total_flos": 1651377344348160.0, "train_loss": 0.5238038238598899, "train_runtime": 56319.7775, "train_samples_per_second": 8.976, "train_steps_per_second": 0.018 } ], "logging_steps": 10, "max_steps": 986, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1651377344348160.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }