| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.838365896980462, |
| "eval_steps": 200, |
| "global_step": 800, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.035523978685612786, |
| "grad_norm": 30.68755531311035, |
| "learning_rate": 2.1176470588235296e-05, |
| "loss": 10.0284, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.07104795737122557, |
| "grad_norm": 11.449149131774902, |
| "learning_rate": 4.470588235294118e-05, |
| "loss": 8.8672, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.10657193605683836, |
| "grad_norm": 8.332048416137695, |
| "learning_rate": 6.823529411764707e-05, |
| "loss": 7.2862, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.14209591474245115, |
| "grad_norm": 8.64196491241455, |
| "learning_rate": 9.176470588235295e-05, |
| "loss": 5.7446, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.17761989342806395, |
| "grad_norm": 9.184565544128418, |
| "learning_rate": 0.00011529411764705881, |
| "loss": 4.3277, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.21314387211367672, |
| "grad_norm": 12.228641510009766, |
| "learning_rate": 0.00013882352941176472, |
| "loss": 2.8672, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.24866785079928952, |
| "grad_norm": 10.162263870239258, |
| "learning_rate": 0.0001623529411764706, |
| "loss": 2.1514, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.2841918294849023, |
| "grad_norm": 8.797259330749512, |
| "learning_rate": 0.00018588235294117648, |
| "loss": 1.9459, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.3197158081705151, |
| "grad_norm": 6.776567459106445, |
| "learning_rate": 0.00019894875164257555, |
| "loss": 1.3218, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.3552397868561279, |
| "grad_norm": 8.384011268615723, |
| "learning_rate": 0.00019632063074901445, |
| "loss": 1.2121, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.3907637655417407, |
| "grad_norm": 6.2972564697265625, |
| "learning_rate": 0.00019369250985545335, |
| "loss": 0.9314, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.42628774422735344, |
| "grad_norm": 5.5578179359436035, |
| "learning_rate": 0.00019106438896189225, |
| "loss": 0.7533, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.46181172291296624, |
| "grad_norm": 5.197394847869873, |
| "learning_rate": 0.00018843626806833115, |
| "loss": 0.7108, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.49733570159857904, |
| "grad_norm": 3.50080943107605, |
| "learning_rate": 0.00018580814717477005, |
| "loss": 0.4706, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.5328596802841918, |
| "grad_norm": 7.297764301300049, |
| "learning_rate": 0.00018318002628120894, |
| "loss": 0.4651, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.5683836589698046, |
| "grad_norm": 3.9045844078063965, |
| "learning_rate": 0.00018055190538764784, |
| "loss": 0.5066, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.6039076376554174, |
| "grad_norm": 3.720517873764038, |
| "learning_rate": 0.00017792378449408674, |
| "loss": 0.4518, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.6394316163410302, |
| "grad_norm": 2.6637465953826904, |
| "learning_rate": 0.00017529566360052564, |
| "loss": 0.3716, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.6749555950266429, |
| "grad_norm": 3.8581767082214355, |
| "learning_rate": 0.00017266754270696454, |
| "loss": 0.2933, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.7104795737122558, |
| "grad_norm": 2.2496705055236816, |
| "learning_rate": 0.0001700394218134034, |
| "loss": 0.299, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.7104795737122558, |
| "eval_loss": 0.08425440639257431, |
| "eval_runtime": 10.2304, |
| "eval_samples_per_second": 48.874, |
| "eval_steps_per_second": 6.158, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.7460035523978685, |
| "grad_norm": 4.2130818367004395, |
| "learning_rate": 0.0001674113009198423, |
| "loss": 0.3048, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.7815275310834814, |
| "grad_norm": 2.9314048290252686, |
| "learning_rate": 0.0001647831800262812, |
| "loss": 0.3255, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.8170515097690941, |
| "grad_norm": 3.629533052444458, |
| "learning_rate": 0.0001621550591327201, |
| "loss": 0.2614, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.8525754884547069, |
| "grad_norm": 2.9846203327178955, |
| "learning_rate": 0.000159526938239159, |
| "loss": 0.2896, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.8880994671403197, |
| "grad_norm": 4.864554405212402, |
| "learning_rate": 0.0001568988173455979, |
| "loss": 0.2299, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.9236234458259325, |
| "grad_norm": 5.363948345184326, |
| "learning_rate": 0.0001542706964520368, |
| "loss": 0.2208, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.9591474245115453, |
| "grad_norm": 7.342677593231201, |
| "learning_rate": 0.0001516425755584757, |
| "loss": 0.2803, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.9946714031971581, |
| "grad_norm": 3.756502389907837, |
| "learning_rate": 0.0001490144546649146, |
| "loss": 0.2073, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.0284191829484903, |
| "grad_norm": 2.067270278930664, |
| "learning_rate": 0.0001463863337713535, |
| "loss": 0.1461, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.063943161634103, |
| "grad_norm": 1.9786051511764526, |
| "learning_rate": 0.0001437582128777924, |
| "loss": 0.1376, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.0994671403197158, |
| "grad_norm": 3.902402877807617, |
| "learning_rate": 0.0001411300919842313, |
| "loss": 0.2187, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.1349911190053286, |
| "grad_norm": 8.216059684753418, |
| "learning_rate": 0.0001385019710906702, |
| "loss": 0.1785, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.1705150976909413, |
| "grad_norm": 3.129307270050049, |
| "learning_rate": 0.00013587385019710906, |
| "loss": 0.2076, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.206039076376554, |
| "grad_norm": 1.8454111814498901, |
| "learning_rate": 0.00013324572930354796, |
| "loss": 0.156, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.241563055062167, |
| "grad_norm": 3.502756118774414, |
| "learning_rate": 0.00013061760840998686, |
| "loss": 0.1637, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.2770870337477798, |
| "grad_norm": 0.7526670694351196, |
| "learning_rate": 0.00012798948751642576, |
| "loss": 0.1272, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.3126110124333925, |
| "grad_norm": 2.0800411701202393, |
| "learning_rate": 0.00012536136662286466, |
| "loss": 0.1372, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.3481349911190053, |
| "grad_norm": 2.1678974628448486, |
| "learning_rate": 0.00012273324572930356, |
| "loss": 0.1563, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.383658969804618, |
| "grad_norm": 3.2046022415161133, |
| "learning_rate": 0.00012010512483574245, |
| "loss": 0.1658, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.419182948490231, |
| "grad_norm": 1.8198497295379639, |
| "learning_rate": 0.00011747700394218135, |
| "loss": 0.1619, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.419182948490231, |
| "eval_loss": 0.04036250337958336, |
| "eval_runtime": 9.5151, |
| "eval_samples_per_second": 52.548, |
| "eval_steps_per_second": 6.621, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.4547069271758437, |
| "grad_norm": 2.0104610919952393, |
| "learning_rate": 0.00011484888304862025, |
| "loss": 0.1181, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.4902309058614565, |
| "grad_norm": 2.408430337905884, |
| "learning_rate": 0.00011222076215505915, |
| "loss": 0.1293, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.5257548845470694, |
| "grad_norm": 1.787520408630371, |
| "learning_rate": 0.00010959264126149803, |
| "loss": 0.1132, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.561278863232682, |
| "grad_norm": 1.8986729383468628, |
| "learning_rate": 0.00010696452036793693, |
| "loss": 0.1296, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.596802841918295, |
| "grad_norm": 2.2806177139282227, |
| "learning_rate": 0.00010433639947437583, |
| "loss": 0.1117, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.6323268206039077, |
| "grad_norm": 2.358901023864746, |
| "learning_rate": 0.00010170827858081473, |
| "loss": 0.1474, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.6678507992895204, |
| "grad_norm": 2.020256280899048, |
| "learning_rate": 9.908015768725362e-05, |
| "loss": 0.1001, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.7033747779751334, |
| "grad_norm": 1.0857290029525757, |
| "learning_rate": 9.645203679369251e-05, |
| "loss": 0.1024, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.738898756660746, |
| "grad_norm": 1.9954943656921387, |
| "learning_rate": 9.382391590013141e-05, |
| "loss": 0.1033, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.7744227353463589, |
| "grad_norm": 2.0594799518585205, |
| "learning_rate": 9.119579500657031e-05, |
| "loss": 0.085, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.8099467140319716, |
| "grad_norm": 2.4020485877990723, |
| "learning_rate": 8.85676741130092e-05, |
| "loss": 0.0943, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.8454706927175843, |
| "grad_norm": 2.926164150238037, |
| "learning_rate": 8.59395532194481e-05, |
| "loss": 0.1207, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.8809946714031973, |
| "grad_norm": 1.3248850107192993, |
| "learning_rate": 8.331143232588699e-05, |
| "loss": 0.1067, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.9165186500888098, |
| "grad_norm": 1.3187663555145264, |
| "learning_rate": 8.068331143232589e-05, |
| "loss": 0.1041, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.9520426287744228, |
| "grad_norm": 3.3121986389160156, |
| "learning_rate": 7.805519053876479e-05, |
| "loss": 0.0672, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.9875666074600356, |
| "grad_norm": 0.5933696627616882, |
| "learning_rate": 7.542706964520369e-05, |
| "loss": 0.084, |
| "step": 560 |
| }, |
| { |
| "epoch": 2.0213143872113677, |
| "grad_norm": 2.998509168624878, |
| "learning_rate": 7.279894875164259e-05, |
| "loss": 0.0631, |
| "step": 570 |
| }, |
| { |
| "epoch": 2.0568383658969807, |
| "grad_norm": 3.291916847229004, |
| "learning_rate": 7.017082785808147e-05, |
| "loss": 0.0829, |
| "step": 580 |
| }, |
| { |
| "epoch": 2.092362344582593, |
| "grad_norm": 1.0062214136123657, |
| "learning_rate": 6.754270696452037e-05, |
| "loss": 0.0753, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.127886323268206, |
| "grad_norm": 1.5099824666976929, |
| "learning_rate": 6.491458607095927e-05, |
| "loss": 0.0921, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.127886323268206, |
| "eval_loss": 0.026381533592939377, |
| "eval_runtime": 8.9034, |
| "eval_samples_per_second": 56.158, |
| "eval_steps_per_second": 7.076, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.1634103019538187, |
| "grad_norm": 0.8741039037704468, |
| "learning_rate": 6.228646517739817e-05, |
| "loss": 0.0499, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.1989342806394316, |
| "grad_norm": 2.0591001510620117, |
| "learning_rate": 5.9658344283837066e-05, |
| "loss": 0.0837, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.2344582593250446, |
| "grad_norm": 0.9756016135215759, |
| "learning_rate": 5.703022339027596e-05, |
| "loss": 0.057, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.269982238010657, |
| "grad_norm": 1.108465552330017, |
| "learning_rate": 5.440210249671485e-05, |
| "loss": 0.0676, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.30550621669627, |
| "grad_norm": 1.4851471185684204, |
| "learning_rate": 5.177398160315374e-05, |
| "loss": 0.0597, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.3410301953818826, |
| "grad_norm": 0.6145038604736328, |
| "learning_rate": 4.9145860709592646e-05, |
| "loss": 0.0633, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.3765541740674956, |
| "grad_norm": 0.6076750755310059, |
| "learning_rate": 4.651773981603154e-05, |
| "loss": 0.063, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.412078152753108, |
| "grad_norm": 0.9240069389343262, |
| "learning_rate": 4.388961892247044e-05, |
| "loss": 0.0586, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.447602131438721, |
| "grad_norm": 0.587311863899231, |
| "learning_rate": 4.1261498028909335e-05, |
| "loss": 0.0549, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.483126110124334, |
| "grad_norm": 2.3599936962127686, |
| "learning_rate": 3.863337713534823e-05, |
| "loss": 0.0676, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.5186500888099466, |
| "grad_norm": 0.4543835520744324, |
| "learning_rate": 3.600525624178712e-05, |
| "loss": 0.0664, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.5541740674955595, |
| "grad_norm": 5.0481486320495605, |
| "learning_rate": 3.337713534822602e-05, |
| "loss": 0.0832, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.589698046181172, |
| "grad_norm": 2.4198014736175537, |
| "learning_rate": 3.0749014454664916e-05, |
| "loss": 0.062, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.625222024866785, |
| "grad_norm": 0.744985818862915, |
| "learning_rate": 2.812089356110381e-05, |
| "loss": 0.0424, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.660746003552398, |
| "grad_norm": 1.223203420639038, |
| "learning_rate": 2.549277266754271e-05, |
| "loss": 0.0657, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.6962699822380105, |
| "grad_norm": 0.9262439608573914, |
| "learning_rate": 2.2864651773981605e-05, |
| "loss": 0.0541, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.7317939609236235, |
| "grad_norm": 0.6838471293449402, |
| "learning_rate": 2.02365308804205e-05, |
| "loss": 0.061, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.767317939609236, |
| "grad_norm": 1.230035662651062, |
| "learning_rate": 1.7608409986859398e-05, |
| "loss": 0.0609, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.802841918294849, |
| "grad_norm": 1.1633126735687256, |
| "learning_rate": 1.4980289093298292e-05, |
| "loss": 0.0878, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.838365896980462, |
| "grad_norm": 1.7632249593734741, |
| "learning_rate": 1.2352168199737188e-05, |
| "loss": 0.0354, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.838365896980462, |
| "eval_loss": 0.02178078517317772, |
| "eval_runtime": 9.1025, |
| "eval_samples_per_second": 54.93, |
| "eval_steps_per_second": 6.921, |
| "step": 800 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 846, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.238828501826765e+16, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|