{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 544, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00919963201471941, "grad_norm": 4.493361949920654, "learning_rate": 7e-06, "loss": 2.5772, "step": 5 }, { "epoch": 0.01839926402943882, "grad_norm": 2.119591236114502, "learning_rate": 1.575e-05, "loss": 2.35, "step": 10 }, { "epoch": 0.027598896044158234, "grad_norm": 0.7079921364784241, "learning_rate": 2.4499999999999996e-05, "loss": 1.9809, "step": 15 }, { "epoch": 0.03679852805887764, "grad_norm": 9.288702964782715, "learning_rate": 3.3249999999999995e-05, "loss": 1.8818, "step": 20 }, { "epoch": 0.045998160073597055, "grad_norm": 0.5875306129455566, "learning_rate": 4.2e-05, "loss": 1.8314, "step": 25 }, { "epoch": 0.05519779208831647, "grad_norm": 0.5792304873466492, "learning_rate": 5.0749999999999994e-05, "loss": 1.6966, "step": 30 }, { "epoch": 0.06439742410303588, "grad_norm": 0.5320371985435486, "learning_rate": 5.9499999999999996e-05, "loss": 1.7159, "step": 35 }, { "epoch": 0.07359705611775529, "grad_norm": 0.41402390599250793, "learning_rate": 6.824999999999999e-05, "loss": 1.6542, "step": 40 }, { "epoch": 0.0827966881324747, "grad_norm": 0.46447646617889404, "learning_rate": 6.98237885462555e-05, "loss": 1.5679, "step": 45 }, { "epoch": 0.09199632014719411, "grad_norm": 0.44029220938682556, "learning_rate": 6.960352422907488e-05, "loss": 1.6419, "step": 50 }, { "epoch": 0.10119595216191353, "grad_norm": 0.425231009721756, "learning_rate": 6.938325991189428e-05, "loss": 1.5516, "step": 55 }, { "epoch": 0.11039558417663294, "grad_norm": 0.4705358147621155, "learning_rate": 6.916299559471366e-05, "loss": 1.572, "step": 60 }, { "epoch": 0.11959521619135234, "grad_norm": 0.5355105400085449, "learning_rate": 6.894273127753303e-05, "loss": 1.5585, "step": 65 }, { "epoch": 0.12879484820607176, "grad_norm": 0.4822757840156555, "learning_rate": 6.872246696035241e-05, "loss": 1.5078, "step": 70 }, { "epoch": 0.13799448022079117, "grad_norm": 0.4612935781478882, "learning_rate": 6.85022026431718e-05, "loss": 1.5677, "step": 75 }, { "epoch": 0.14719411223551057, "grad_norm": 0.5228879451751709, "learning_rate": 6.828193832599119e-05, "loss": 1.5224, "step": 80 }, { "epoch": 0.15639374425023, "grad_norm": 0.4631298780441284, "learning_rate": 6.806167400881057e-05, "loss": 1.4413, "step": 85 }, { "epoch": 0.1655933762649494, "grad_norm": 0.5098944306373596, "learning_rate": 6.784140969162995e-05, "loss": 1.4965, "step": 90 }, { "epoch": 0.17479300827966882, "grad_norm": 0.5739426016807556, "learning_rate": 6.762114537444933e-05, "loss": 1.4526, "step": 95 }, { "epoch": 0.18399264029438822, "grad_norm": 0.5409196019172668, "learning_rate": 6.740088105726871e-05, "loss": 1.4667, "step": 100 }, { "epoch": 0.19319227230910763, "grad_norm": 0.6061224341392517, "learning_rate": 6.71806167400881e-05, "loss": 1.5092, "step": 105 }, { "epoch": 0.20239190432382706, "grad_norm": 4.403282642364502, "learning_rate": 6.696035242290749e-05, "loss": 1.42, "step": 110 }, { "epoch": 0.21159153633854647, "grad_norm": 0.5433526635169983, "learning_rate": 6.674008810572687e-05, "loss": 1.5852, "step": 115 }, { "epoch": 0.22079116835326587, "grad_norm": 0.5382786393165588, "learning_rate": 6.651982378854625e-05, "loss": 1.4788, "step": 120 }, { "epoch": 0.22999080036798528, "grad_norm": 0.5642833709716797, "learning_rate": 6.629955947136563e-05, "loss": 1.4694, "step": 125 }, { "epoch": 0.23919043238270468, "grad_norm": 0.5912417769432068, "learning_rate": 6.607929515418502e-05, "loss": 1.4685, "step": 130 }, { "epoch": 0.24839006439742412, "grad_norm": 0.584441602230072, "learning_rate": 6.58590308370044e-05, "loss": 1.4401, "step": 135 }, { "epoch": 0.2575896964121435, "grad_norm": 0.5686805844306946, "learning_rate": 6.563876651982378e-05, "loss": 1.4622, "step": 140 }, { "epoch": 0.2667893284268629, "grad_norm": 0.5634583234786987, "learning_rate": 6.541850220264316e-05, "loss": 1.4674, "step": 145 }, { "epoch": 0.27598896044158233, "grad_norm": 0.5362507700920105, "learning_rate": 6.519823788546254e-05, "loss": 1.4368, "step": 150 }, { "epoch": 0.28518859245630174, "grad_norm": 0.528346836566925, "learning_rate": 6.497797356828193e-05, "loss": 1.4645, "step": 155 }, { "epoch": 0.29438822447102114, "grad_norm": 0.6441851258277893, "learning_rate": 6.475770925110131e-05, "loss": 1.4359, "step": 160 }, { "epoch": 0.30358785648574055, "grad_norm": 0.6480420231819153, "learning_rate": 6.45374449339207e-05, "loss": 1.5072, "step": 165 }, { "epoch": 0.31278748850046, "grad_norm": 0.6448924541473389, "learning_rate": 6.431718061674008e-05, "loss": 1.3991, "step": 170 }, { "epoch": 0.3219871205151794, "grad_norm": 232.7066650390625, "learning_rate": 6.409691629955947e-05, "loss": 1.4437, "step": 175 }, { "epoch": 0.3311867525298988, "grad_norm": 0.6596648097038269, "learning_rate": 6.387665198237885e-05, "loss": 1.4778, "step": 180 }, { "epoch": 0.3403863845446182, "grad_norm": 0.6211815476417542, "learning_rate": 6.365638766519823e-05, "loss": 1.4235, "step": 185 }, { "epoch": 0.34958601655933763, "grad_norm": 0.597537636756897, "learning_rate": 6.343612334801761e-05, "loss": 1.4416, "step": 190 }, { "epoch": 0.35878564857405704, "grad_norm": 0.5805206894874573, "learning_rate": 6.321585903083701e-05, "loss": 1.4412, "step": 195 }, { "epoch": 0.36798528058877644, "grad_norm": 0.6511718034744263, "learning_rate": 6.299559471365639e-05, "loss": 1.4406, "step": 200 }, { "epoch": 0.37718491260349585, "grad_norm": 0.6224706768989563, "learning_rate": 6.277533039647577e-05, "loss": 1.428, "step": 205 }, { "epoch": 0.38638454461821525, "grad_norm": 0.7181910276412964, "learning_rate": 6.255506607929515e-05, "loss": 1.4363, "step": 210 }, { "epoch": 0.39558417663293466, "grad_norm": 0.6790558695793152, "learning_rate": 6.233480176211453e-05, "loss": 1.4516, "step": 215 }, { "epoch": 0.4047838086476541, "grad_norm": 0.9428783655166626, "learning_rate": 6.211453744493392e-05, "loss": 1.4527, "step": 220 }, { "epoch": 0.4139834406623735, "grad_norm": 0.7599747180938721, "learning_rate": 6.18942731277533e-05, "loss": 1.4756, "step": 225 }, { "epoch": 0.42318307267709293, "grad_norm": 0.5899630784988403, "learning_rate": 6.167400881057268e-05, "loss": 1.4126, "step": 230 }, { "epoch": 0.43238270469181234, "grad_norm": 0.7069624066352844, "learning_rate": 6.145374449339206e-05, "loss": 1.3768, "step": 235 }, { "epoch": 0.44158233670653174, "grad_norm": 1.1761517524719238, "learning_rate": 6.123348017621144e-05, "loss": 1.3948, "step": 240 }, { "epoch": 0.45078196872125115, "grad_norm": 0.8364585041999817, "learning_rate": 6.101321585903083e-05, "loss": 1.4601, "step": 245 }, { "epoch": 0.45998160073597055, "grad_norm": 0.7272374033927917, "learning_rate": 6.0792951541850214e-05, "loss": 1.4054, "step": 250 }, { "epoch": 0.46918123275068996, "grad_norm": 0.722029983997345, "learning_rate": 6.0572687224669595e-05, "loss": 1.4382, "step": 255 }, { "epoch": 0.47838086476540936, "grad_norm": 0.7180947065353394, "learning_rate": 6.0352422907488984e-05, "loss": 1.414, "step": 260 }, { "epoch": 0.48758049678012877, "grad_norm": 0.8569415211677551, "learning_rate": 6.0132158590308366e-05, "loss": 1.3801, "step": 265 }, { "epoch": 0.49678012879484823, "grad_norm": 0.6971564292907715, "learning_rate": 5.991189427312775e-05, "loss": 1.4264, "step": 270 }, { "epoch": 0.5059797608095676, "grad_norm": 1.1400465965270996, "learning_rate": 5.9691629955947136e-05, "loss": 1.4571, "step": 275 }, { "epoch": 0.515179392824287, "grad_norm": 0.6640013456344604, "learning_rate": 5.947136563876652e-05, "loss": 1.4494, "step": 280 }, { "epoch": 0.5243790248390064, "grad_norm": 0.612842857837677, "learning_rate": 5.92511013215859e-05, "loss": 1.3979, "step": 285 }, { "epoch": 0.5335786568537259, "grad_norm": 6.370260715484619, "learning_rate": 5.903083700440528e-05, "loss": 1.4008, "step": 290 }, { "epoch": 0.5427782888684453, "grad_norm": 0.6938475966453552, "learning_rate": 5.881057268722466e-05, "loss": 1.3901, "step": 295 }, { "epoch": 0.5519779208831647, "grad_norm": 0.7252101302146912, "learning_rate": 5.8590308370044045e-05, "loss": 1.4129, "step": 300 }, { "epoch": 0.5611775528978841, "grad_norm": 0.7261694669723511, "learning_rate": 5.837004405286343e-05, "loss": 1.4504, "step": 305 }, { "epoch": 0.5703771849126035, "grad_norm": 1.128242015838623, "learning_rate": 5.8149779735682815e-05, "loss": 1.4392, "step": 310 }, { "epoch": 0.5795768169273229, "grad_norm": 0.7343323230743408, "learning_rate": 5.79295154185022e-05, "loss": 1.4001, "step": 315 }, { "epoch": 0.5887764489420423, "grad_norm": 0.6398953795433044, "learning_rate": 5.770925110132158e-05, "loss": 1.4533, "step": 320 }, { "epoch": 0.5979760809567617, "grad_norm": 0.9343668222427368, "learning_rate": 5.748898678414096e-05, "loss": 1.4464, "step": 325 }, { "epoch": 0.6071757129714811, "grad_norm": 0.7744137644767761, "learning_rate": 5.726872246696035e-05, "loss": 1.3802, "step": 330 }, { "epoch": 0.6163753449862005, "grad_norm": 0.7824112772941589, "learning_rate": 5.704845814977973e-05, "loss": 1.3885, "step": 335 }, { "epoch": 0.62557497700092, "grad_norm": 0.7310079336166382, "learning_rate": 5.682819383259911e-05, "loss": 1.3351, "step": 340 }, { "epoch": 0.6347746090156394, "grad_norm": 0.6795840859413147, "learning_rate": 5.66079295154185e-05, "loss": 1.4371, "step": 345 }, { "epoch": 0.6439742410303588, "grad_norm": 0.7529902458190918, "learning_rate": 5.638766519823788e-05, "loss": 1.4985, "step": 350 }, { "epoch": 0.6531738730450782, "grad_norm": 0.7649794220924377, "learning_rate": 5.6167400881057265e-05, "loss": 1.4007, "step": 355 }, { "epoch": 0.6623735050597976, "grad_norm": 0.8272032141685486, "learning_rate": 5.594713656387665e-05, "loss": 1.4233, "step": 360 }, { "epoch": 0.671573137074517, "grad_norm": 0.7489705085754395, "learning_rate": 5.5726872246696035e-05, "loss": 1.3739, "step": 365 }, { "epoch": 0.6807727690892365, "grad_norm": 0.8254089951515198, "learning_rate": 5.550660792951541e-05, "loss": 1.5, "step": 370 }, { "epoch": 0.6899724011039559, "grad_norm": 0.7875320911407471, "learning_rate": 5.528634361233479e-05, "loss": 1.3998, "step": 375 }, { "epoch": 0.6991720331186753, "grad_norm": 0.7118289470672607, "learning_rate": 5.506607929515418e-05, "loss": 1.4549, "step": 380 }, { "epoch": 0.7083716651333947, "grad_norm": 0.7725639343261719, "learning_rate": 5.484581497797356e-05, "loss": 1.3842, "step": 385 }, { "epoch": 0.7175712971481141, "grad_norm": 1.2431738376617432, "learning_rate": 5.4625550660792944e-05, "loss": 1.4364, "step": 390 }, { "epoch": 0.7267709291628335, "grad_norm": 0.9481919407844543, "learning_rate": 5.4405286343612326e-05, "loss": 1.3786, "step": 395 }, { "epoch": 0.7359705611775529, "grad_norm": 0.7844451069831848, "learning_rate": 5.4185022026431715e-05, "loss": 1.3388, "step": 400 }, { "epoch": 0.7451701931922723, "grad_norm": 0.8081017136573792, "learning_rate": 5.3964757709251096e-05, "loss": 1.4376, "step": 405 }, { "epoch": 0.7543698252069917, "grad_norm": 13030255.0, "learning_rate": 5.374449339207048e-05, "loss": 1.439, "step": 410 }, { "epoch": 0.7635694572217111, "grad_norm": 0.7132210731506348, "learning_rate": 5.352422907488987e-05, "loss": 1.392, "step": 415 }, { "epoch": 0.7727690892364305, "grad_norm": 0.8951236009597778, "learning_rate": 5.330396475770925e-05, "loss": 1.3945, "step": 420 }, { "epoch": 0.7819687212511499, "grad_norm": 0.7916706800460815, "learning_rate": 5.308370044052863e-05, "loss": 1.3567, "step": 425 }, { "epoch": 0.7911683532658693, "grad_norm": 0.8253033757209778, "learning_rate": 5.286343612334801e-05, "loss": 1.3707, "step": 430 }, { "epoch": 0.8003679852805887, "grad_norm": 0.8816981911659241, "learning_rate": 5.26431718061674e-05, "loss": 1.3911, "step": 435 }, { "epoch": 0.8095676172953082, "grad_norm": 0.7913665175437927, "learning_rate": 5.242290748898678e-05, "loss": 1.3613, "step": 440 }, { "epoch": 0.8187672493100276, "grad_norm": 0.7874677777290344, "learning_rate": 5.2202643171806164e-05, "loss": 1.4487, "step": 445 }, { "epoch": 0.827966881324747, "grad_norm": 0.7003588080406189, "learning_rate": 5.1982378854625546e-05, "loss": 1.3712, "step": 450 }, { "epoch": 0.8371665133394665, "grad_norm": 0.7311933040618896, "learning_rate": 5.176211453744493e-05, "loss": 1.3422, "step": 455 }, { "epoch": 0.8463661453541859, "grad_norm": 0.823244571685791, "learning_rate": 5.154185022026431e-05, "loss": 1.3605, "step": 460 }, { "epoch": 0.8555657773689053, "grad_norm": 0.8285578489303589, "learning_rate": 5.132158590308369e-05, "loss": 1.418, "step": 465 }, { "epoch": 0.8647654093836247, "grad_norm": 0.8007466197013855, "learning_rate": 5.110132158590308e-05, "loss": 1.3539, "step": 470 }, { "epoch": 0.8739650413983441, "grad_norm": 0.7975384593009949, "learning_rate": 5.088105726872246e-05, "loss": 1.3959, "step": 475 }, { "epoch": 0.8831646734130635, "grad_norm": 0.931978702545166, "learning_rate": 5.0660792951541843e-05, "loss": 1.3734, "step": 480 }, { "epoch": 0.8923643054277829, "grad_norm": 0.9099324345588684, "learning_rate": 5.044052863436123e-05, "loss": 1.3722, "step": 485 }, { "epoch": 0.9015639374425023, "grad_norm": 0.7978657484054565, "learning_rate": 5.0220264317180614e-05, "loss": 1.3741, "step": 490 }, { "epoch": 0.9107635694572217, "grad_norm": 0.8722144365310669, "learning_rate": 4.9999999999999996e-05, "loss": 1.3869, "step": 495 }, { "epoch": 0.9199632014719411, "grad_norm": 1309.4061279296875, "learning_rate": 4.977973568281938e-05, "loss": 1.4329, "step": 500 }, { "epoch": 0.9291628334866605, "grad_norm": 0.807162880897522, "learning_rate": 4.9559471365638766e-05, "loss": 1.3755, "step": 505 }, { "epoch": 0.9383624655013799, "grad_norm": 0.9719377160072327, "learning_rate": 4.933920704845815e-05, "loss": 1.4036, "step": 510 }, { "epoch": 0.9475620975160993, "grad_norm": 3.86576247215271, "learning_rate": 4.911894273127753e-05, "loss": 1.3866, "step": 515 }, { "epoch": 0.9567617295308187, "grad_norm": 0.8033193945884705, "learning_rate": 4.889867841409692e-05, "loss": 1.3861, "step": 520 }, { "epoch": 0.9659613615455381, "grad_norm": 0.8535065650939941, "learning_rate": 4.867841409691629e-05, "loss": 1.2592, "step": 525 }, { "epoch": 0.9751609935602575, "grad_norm": 0.8155761957168579, "learning_rate": 4.8458149779735675e-05, "loss": 1.3123, "step": 530 }, { "epoch": 0.984360625574977, "grad_norm": 155.9907989501953, "learning_rate": 4.823788546255506e-05, "loss": 1.3567, "step": 535 }, { "epoch": 0.9935602575896965, "grad_norm": 2.225149631500244, "learning_rate": 4.8017621145374445e-05, "loss": 1.3736, "step": 540 } ], "logging_steps": 5, "max_steps": 1629, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.125792658549078e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }