{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.19085225403093123, "eval_steps": 500, "global_step": 580, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016452780519907865, "grad_norm": 1.5217232704162598, "learning_rate": 0.00012, "loss": 4.5224, "step": 5 }, { "epoch": 0.003290556103981573, "grad_norm": 0.574784517288208, "learning_rate": 0.0001998022412656559, "loss": 3.2397, "step": 10 }, { "epoch": 0.004935834155972359, "grad_norm": 0.5069302320480347, "learning_rate": 0.0001994726433750824, "loss": 2.7151, "step": 15 }, { "epoch": 0.006581112207963146, "grad_norm": 0.5865616202354431, "learning_rate": 0.00019914304548450891, "loss": 2.3507, "step": 20 }, { "epoch": 0.008226390259953932, "grad_norm": 0.5909593105316162, "learning_rate": 0.0001988134475939354, "loss": 2.3338, "step": 25 }, { "epoch": 0.009871668311944718, "grad_norm": 0.4913259744644165, "learning_rate": 0.0001984838497033619, "loss": 2.3544, "step": 30 }, { "epoch": 0.011516946363935505, "grad_norm": 0.4331064522266388, "learning_rate": 0.00019815425181278842, "loss": 2.2672, "step": 35 }, { "epoch": 0.013162224415926292, "grad_norm": 0.42509177327156067, "learning_rate": 0.0001978246539222149, "loss": 2.2764, "step": 40 }, { "epoch": 0.014807502467917079, "grad_norm": 0.415414035320282, "learning_rate": 0.0001974950560316414, "loss": 2.1035, "step": 45 }, { "epoch": 0.016452780519907863, "grad_norm": 0.5336166024208069, "learning_rate": 0.0001971654581410679, "loss": 2.2284, "step": 50 }, { "epoch": 0.01809805857189865, "grad_norm": 0.3719511926174164, "learning_rate": 0.0001968358602504944, "loss": 2.1656, "step": 55 }, { "epoch": 0.019743336623889437, "grad_norm": 0.49993693828582764, "learning_rate": 0.00019650626235992092, "loss": 2.0299, "step": 60 }, { "epoch": 0.021388614675880224, "grad_norm": 0.5019952654838562, "learning_rate": 0.0001961766644693474, "loss": 2.0951, "step": 65 }, { "epoch": 0.02303389272787101, "grad_norm": 0.4793168306350708, "learning_rate": 0.0001958470665787739, "loss": 2.1505, "step": 70 }, { "epoch": 0.024679170779861797, "grad_norm": 0.47024038434028625, "learning_rate": 0.0001955174686882004, "loss": 2.054, "step": 75 }, { "epoch": 0.026324448831852584, "grad_norm": 0.5920702815055847, "learning_rate": 0.0001951878707976269, "loss": 2.1802, "step": 80 }, { "epoch": 0.02796972688384337, "grad_norm": 0.4485560655593872, "learning_rate": 0.0001948582729070534, "loss": 2.1609, "step": 85 }, { "epoch": 0.029615004935834157, "grad_norm": 0.41952958703041077, "learning_rate": 0.0001945286750164799, "loss": 2.0584, "step": 90 }, { "epoch": 0.03126028298782494, "grad_norm": 0.40542706847190857, "learning_rate": 0.00019419907712590643, "loss": 2.22, "step": 95 }, { "epoch": 0.03290556103981573, "grad_norm": 0.438912570476532, "learning_rate": 0.0001938694792353329, "loss": 2.1478, "step": 100 }, { "epoch": 0.034550839091806514, "grad_norm": 0.46580272912979126, "learning_rate": 0.0001935398813447594, "loss": 2.1227, "step": 105 }, { "epoch": 0.0361961171437973, "grad_norm": 0.5012261271476746, "learning_rate": 0.0001932102834541859, "loss": 2.1274, "step": 110 }, { "epoch": 0.03784139519578809, "grad_norm": 0.5175459384918213, "learning_rate": 0.0001928806855636124, "loss": 2.0726, "step": 115 }, { "epoch": 0.039486673247778874, "grad_norm": 0.5441685318946838, "learning_rate": 0.00019255108767303892, "loss": 2.1269, "step": 120 }, { "epoch": 0.04113195129976966, "grad_norm": 0.4177902042865753, "learning_rate": 0.0001922214897824654, "loss": 2.164, "step": 125 }, { "epoch": 0.04277722935176045, "grad_norm": 0.44649720191955566, "learning_rate": 0.0001918918918918919, "loss": 2.0883, "step": 130 }, { "epoch": 0.044422507403751234, "grad_norm": 0.46839994192123413, "learning_rate": 0.0001915622940013184, "loss": 2.0448, "step": 135 }, { "epoch": 0.04606778545574202, "grad_norm": 0.4343637228012085, "learning_rate": 0.0001912326961107449, "loss": 2.0282, "step": 140 }, { "epoch": 0.04771306350773281, "grad_norm": 0.4241706132888794, "learning_rate": 0.00019090309822017141, "loss": 2.0895, "step": 145 }, { "epoch": 0.049358341559723594, "grad_norm": 0.44053712487220764, "learning_rate": 0.0001905735003295979, "loss": 2.0925, "step": 150 }, { "epoch": 0.05100361961171438, "grad_norm": 0.39026254415512085, "learning_rate": 0.0001902439024390244, "loss": 2.1519, "step": 155 }, { "epoch": 0.05264889766370517, "grad_norm": 0.455168217420578, "learning_rate": 0.0001899143045484509, "loss": 2.1623, "step": 160 }, { "epoch": 0.054294175715695954, "grad_norm": 0.4873504042625427, "learning_rate": 0.0001895847066578774, "loss": 1.9314, "step": 165 }, { "epoch": 0.05593945376768674, "grad_norm": 0.5435200929641724, "learning_rate": 0.0001892551087673039, "loss": 2.0674, "step": 170 }, { "epoch": 0.05758473181967753, "grad_norm": 0.44813185930252075, "learning_rate": 0.0001889255108767304, "loss": 2.0042, "step": 175 }, { "epoch": 0.059230009871668314, "grad_norm": 0.5993271470069885, "learning_rate": 0.0001885959129861569, "loss": 2.0942, "step": 180 }, { "epoch": 0.0608752879236591, "grad_norm": 0.5044869780540466, "learning_rate": 0.0001882663150955834, "loss": 2.118, "step": 185 }, { "epoch": 0.06252056597564988, "grad_norm": 0.4713231325149536, "learning_rate": 0.00018793671720500988, "loss": 2.1232, "step": 190 }, { "epoch": 0.06416584402764067, "grad_norm": 0.5351199507713318, "learning_rate": 0.0001876071193144364, "loss": 2.0956, "step": 195 }, { "epoch": 0.06581112207963145, "grad_norm": 0.380096971988678, "learning_rate": 0.0001872775214238629, "loss": 2.0466, "step": 200 }, { "epoch": 0.06745640013162224, "grad_norm": 0.4392818510532379, "learning_rate": 0.0001869479235332894, "loss": 2.0503, "step": 205 }, { "epoch": 0.06910167818361303, "grad_norm": 0.49540552496910095, "learning_rate": 0.00018661832564271588, "loss": 2.0771, "step": 210 }, { "epoch": 0.07074695623560381, "grad_norm": 0.5129232406616211, "learning_rate": 0.00018628872775214238, "loss": 1.9724, "step": 215 }, { "epoch": 0.0723922342875946, "grad_norm": 0.4697638750076294, "learning_rate": 0.0001859591298615689, "loss": 2.03, "step": 220 }, { "epoch": 0.07403751233958539, "grad_norm": 0.4250948131084442, "learning_rate": 0.0001856295319709954, "loss": 2.0235, "step": 225 }, { "epoch": 0.07568279039157617, "grad_norm": 0.5197622776031494, "learning_rate": 0.0001852999340804219, "loss": 1.991, "step": 230 }, { "epoch": 0.07732806844356696, "grad_norm": 0.45986393094062805, "learning_rate": 0.0001849703361898484, "loss": 2.0191, "step": 235 }, { "epoch": 0.07897334649555775, "grad_norm": 0.5618834495544434, "learning_rate": 0.00018464073829927487, "loss": 1.9859, "step": 240 }, { "epoch": 0.08061862454754853, "grad_norm": 0.5400542616844177, "learning_rate": 0.0001843111404087014, "loss": 1.9633, "step": 245 }, { "epoch": 0.08226390259953932, "grad_norm": 0.5259667038917542, "learning_rate": 0.0001839815425181279, "loss": 1.8768, "step": 250 }, { "epoch": 0.08390918065153011, "grad_norm": 0.5015618801116943, "learning_rate": 0.0001836519446275544, "loss": 2.0329, "step": 255 }, { "epoch": 0.0855544587035209, "grad_norm": 0.4835856854915619, "learning_rate": 0.0001833223467369809, "loss": 1.9815, "step": 260 }, { "epoch": 0.08719973675551168, "grad_norm": 0.4211411774158478, "learning_rate": 0.0001829927488464074, "loss": 1.9899, "step": 265 }, { "epoch": 0.08884501480750247, "grad_norm": 0.4507792294025421, "learning_rate": 0.0001826631509558339, "loss": 1.9865, "step": 270 }, { "epoch": 0.09049029285949325, "grad_norm": 0.5402964353561401, "learning_rate": 0.00018233355306526038, "loss": 1.902, "step": 275 }, { "epoch": 0.09213557091148404, "grad_norm": 0.4574088454246521, "learning_rate": 0.0001820039551746869, "loss": 1.9727, "step": 280 }, { "epoch": 0.09378084896347483, "grad_norm": 0.4615534842014313, "learning_rate": 0.0001816743572841134, "loss": 2.0582, "step": 285 }, { "epoch": 0.09542612701546561, "grad_norm": 0.5126486420631409, "learning_rate": 0.0001813447593935399, "loss": 2.0274, "step": 290 }, { "epoch": 0.0970714050674564, "grad_norm": 0.6757667660713196, "learning_rate": 0.00018101516150296638, "loss": 1.879, "step": 295 }, { "epoch": 0.09871668311944719, "grad_norm": 0.49488508701324463, "learning_rate": 0.00018068556361239288, "loss": 1.9348, "step": 300 }, { "epoch": 0.10036196117143797, "grad_norm": 0.5860428810119629, "learning_rate": 0.0001803559657218194, "loss": 1.9276, "step": 305 }, { "epoch": 0.10200723922342876, "grad_norm": 0.5148414373397827, "learning_rate": 0.0001800263678312459, "loss": 2.0563, "step": 310 }, { "epoch": 0.10365251727541955, "grad_norm": 0.5046892762184143, "learning_rate": 0.00017969676994067238, "loss": 2.0009, "step": 315 }, { "epoch": 0.10529779532741033, "grad_norm": 0.4465779960155487, "learning_rate": 0.0001793671720500989, "loss": 1.9842, "step": 320 }, { "epoch": 0.10694307337940112, "grad_norm": 0.4488319158554077, "learning_rate": 0.00017903757415952537, "loss": 1.9881, "step": 325 }, { "epoch": 0.10858835143139191, "grad_norm": 0.46680882573127747, "learning_rate": 0.0001787079762689519, "loss": 2.0471, "step": 330 }, { "epoch": 0.1102336294833827, "grad_norm": 0.5483986139297485, "learning_rate": 0.00017837837837837839, "loss": 2.0581, "step": 335 }, { "epoch": 0.11187890753537348, "grad_norm": 0.4938408434391022, "learning_rate": 0.00017804878048780488, "loss": 1.9712, "step": 340 }, { "epoch": 0.11352418558736427, "grad_norm": 0.4176371991634369, "learning_rate": 0.0001777191825972314, "loss": 1.9019, "step": 345 }, { "epoch": 0.11516946363935505, "grad_norm": 0.45936137437820435, "learning_rate": 0.0001773895847066579, "loss": 1.8773, "step": 350 }, { "epoch": 0.11681474169134584, "grad_norm": 0.5166374444961548, "learning_rate": 0.0001770599868160844, "loss": 2.0059, "step": 355 }, { "epoch": 0.11846001974333663, "grad_norm": 0.5485665202140808, "learning_rate": 0.00017673038892551088, "loss": 1.9634, "step": 360 }, { "epoch": 0.12010529779532741, "grad_norm": 0.44683098793029785, "learning_rate": 0.00017640079103493737, "loss": 1.9239, "step": 365 }, { "epoch": 0.1217505758473182, "grad_norm": 0.4426558315753937, "learning_rate": 0.0001760711931443639, "loss": 1.9467, "step": 370 }, { "epoch": 0.12339585389930899, "grad_norm": 0.45059794187545776, "learning_rate": 0.0001757415952537904, "loss": 1.9099, "step": 375 }, { "epoch": 0.12504113195129976, "grad_norm": 0.47326767444610596, "learning_rate": 0.00017541199736321688, "loss": 1.9521, "step": 380 }, { "epoch": 0.12668641000329056, "grad_norm": 0.4886496961116791, "learning_rate": 0.00017508239947264337, "loss": 1.9336, "step": 385 }, { "epoch": 0.12833168805528133, "grad_norm": 0.4394533336162567, "learning_rate": 0.00017475280158206987, "loss": 1.8583, "step": 390 }, { "epoch": 0.12997696610727213, "grad_norm": 0.5217518210411072, "learning_rate": 0.0001744232036914964, "loss": 1.9916, "step": 395 }, { "epoch": 0.1316222441592629, "grad_norm": 0.44888633489608765, "learning_rate": 0.00017409360580092288, "loss": 2.0022, "step": 400 }, { "epoch": 0.1332675222112537, "grad_norm": 0.5385366678237915, "learning_rate": 0.0001737640079103494, "loss": 1.9492, "step": 405 }, { "epoch": 0.13491280026324448, "grad_norm": 0.4314708113670349, "learning_rate": 0.00017343441001977587, "loss": 2.0088, "step": 410 }, { "epoch": 0.13655807831523528, "grad_norm": 0.4006335735321045, "learning_rate": 0.00017310481212920236, "loss": 1.9524, "step": 415 }, { "epoch": 0.13820335636722605, "grad_norm": 0.5291544198989868, "learning_rate": 0.00017277521423862888, "loss": 1.8593, "step": 420 }, { "epoch": 0.13984863441921686, "grad_norm": 0.47129592299461365, "learning_rate": 0.00017244561634805538, "loss": 1.9542, "step": 425 }, { "epoch": 0.14149391247120763, "grad_norm": 0.449595183134079, "learning_rate": 0.0001721160184574819, "loss": 1.9294, "step": 430 }, { "epoch": 0.14313919052319843, "grad_norm": 0.4410437047481537, "learning_rate": 0.0001717864205669084, "loss": 1.9209, "step": 435 }, { "epoch": 0.1447844685751892, "grad_norm": 0.5655415654182434, "learning_rate": 0.00017145682267633488, "loss": 1.9486, "step": 440 }, { "epoch": 0.14642974662718, "grad_norm": 0.5219452381134033, "learning_rate": 0.00017112722478576138, "loss": 1.8922, "step": 445 }, { "epoch": 0.14807502467917077, "grad_norm": 0.49918806552886963, "learning_rate": 0.00017079762689518787, "loss": 1.92, "step": 450 }, { "epoch": 0.14972030273116158, "grad_norm": 0.4410875141620636, "learning_rate": 0.0001704680290046144, "loss": 1.9363, "step": 455 }, { "epoch": 0.15136558078315235, "grad_norm": 0.4709133207798004, "learning_rate": 0.00017013843111404089, "loss": 1.8803, "step": 460 }, { "epoch": 0.15301085883514315, "grad_norm": 0.447390079498291, "learning_rate": 0.00016980883322346738, "loss": 2.0388, "step": 465 }, { "epoch": 0.15465613688713392, "grad_norm": 0.4439023435115814, "learning_rate": 0.00016947923533289387, "loss": 1.9112, "step": 470 }, { "epoch": 0.15630141493912472, "grad_norm": 0.5134996175765991, "learning_rate": 0.00016914963744232037, "loss": 1.9556, "step": 475 }, { "epoch": 0.1579466929911155, "grad_norm": 0.5412283539772034, "learning_rate": 0.0001688200395517469, "loss": 1.899, "step": 480 }, { "epoch": 0.1595919710431063, "grad_norm": 0.46328097581863403, "learning_rate": 0.00016849044166117338, "loss": 1.8476, "step": 485 }, { "epoch": 0.16123724909509707, "grad_norm": 0.43716996908187866, "learning_rate": 0.00016816084377059987, "loss": 1.9059, "step": 490 }, { "epoch": 0.16288252714708787, "grad_norm": 0.4769724905490875, "learning_rate": 0.00016783124588002637, "loss": 1.9186, "step": 495 }, { "epoch": 0.16452780519907864, "grad_norm": 0.5047943592071533, "learning_rate": 0.00016750164798945286, "loss": 1.9161, "step": 500 }, { "epoch": 0.16617308325106944, "grad_norm": 0.4556055963039398, "learning_rate": 0.00016717205009887938, "loss": 1.8834, "step": 505 }, { "epoch": 0.16781836130306022, "grad_norm": 0.4692705571651459, "learning_rate": 0.00016684245220830588, "loss": 1.9091, "step": 510 }, { "epoch": 0.16946363935505102, "grad_norm": 0.43482905626296997, "learning_rate": 0.00016651285431773237, "loss": 1.9405, "step": 515 }, { "epoch": 0.1711089174070418, "grad_norm": 0.5708907246589661, "learning_rate": 0.0001661832564271589, "loss": 1.9027, "step": 520 }, { "epoch": 0.1727541954590326, "grad_norm": 0.49181491136550903, "learning_rate": 0.00016585365853658536, "loss": 1.8526, "step": 525 }, { "epoch": 0.17439947351102336, "grad_norm": 0.5000940561294556, "learning_rate": 0.00016552406064601188, "loss": 1.8588, "step": 530 }, { "epoch": 0.17604475156301416, "grad_norm": 0.45289117097854614, "learning_rate": 0.00016519446275543837, "loss": 2.0358, "step": 535 }, { "epoch": 0.17769002961500494, "grad_norm": 0.5227617621421814, "learning_rate": 0.00016486486486486486, "loss": 1.8512, "step": 540 }, { "epoch": 0.17933530766699574, "grad_norm": 0.512492299079895, "learning_rate": 0.00016453526697429138, "loss": 1.9014, "step": 545 }, { "epoch": 0.1809805857189865, "grad_norm": 0.4832890033721924, "learning_rate": 0.00016420566908371785, "loss": 1.9765, "step": 550 }, { "epoch": 0.18262586377097728, "grad_norm": 0.4797350764274597, "learning_rate": 0.00016387607119314437, "loss": 1.8907, "step": 555 }, { "epoch": 0.18427114182296808, "grad_norm": 0.48889973759651184, "learning_rate": 0.00016354647330257086, "loss": 1.9369, "step": 560 }, { "epoch": 0.18591641987495885, "grad_norm": 0.44188860058784485, "learning_rate": 0.00016321687541199739, "loss": 1.8547, "step": 565 }, { "epoch": 0.18756169792694966, "grad_norm": 0.5300605893135071, "learning_rate": 0.00016288727752142388, "loss": 1.8603, "step": 570 }, { "epoch": 0.18920697597894043, "grad_norm": 0.5411728620529175, "learning_rate": 0.00016255767963085037, "loss": 1.8067, "step": 575 }, { "epoch": 0.19085225403093123, "grad_norm": 0.5871673226356506, "learning_rate": 0.00016222808174027687, "loss": 1.8875, "step": 580 } ], "logging_steps": 5, "max_steps": 3039, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1842159022647296e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }