{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2632444883185258, "eval_steps": 500, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016452780519907865, "grad_norm": 1.5217232704162598, "learning_rate": 0.00012, "loss": 4.5224, "step": 5 }, { "epoch": 0.003290556103981573, "grad_norm": 0.574784517288208, "learning_rate": 0.0001998022412656559, "loss": 3.2397, "step": 10 }, { "epoch": 0.004935834155972359, "grad_norm": 0.5069302320480347, "learning_rate": 0.0001994726433750824, "loss": 2.7151, "step": 15 }, { "epoch": 0.006581112207963146, "grad_norm": 0.5865616202354431, "learning_rate": 0.00019914304548450891, "loss": 2.3507, "step": 20 }, { "epoch": 0.008226390259953932, "grad_norm": 0.5909593105316162, "learning_rate": 0.0001988134475939354, "loss": 2.3338, "step": 25 }, { "epoch": 0.009871668311944718, "grad_norm": 0.4913259744644165, "learning_rate": 0.0001984838497033619, "loss": 2.3544, "step": 30 }, { "epoch": 0.011516946363935505, "grad_norm": 0.4331064522266388, "learning_rate": 0.00019815425181278842, "loss": 2.2672, "step": 35 }, { "epoch": 0.013162224415926292, "grad_norm": 0.42509177327156067, "learning_rate": 0.0001978246539222149, "loss": 2.2764, "step": 40 }, { "epoch": 0.014807502467917079, "grad_norm": 0.415414035320282, "learning_rate": 0.0001974950560316414, "loss": 2.1035, "step": 45 }, { "epoch": 0.016452780519907863, "grad_norm": 0.5336166024208069, "learning_rate": 0.0001971654581410679, "loss": 2.2284, "step": 50 }, { "epoch": 0.01809805857189865, "grad_norm": 0.3719511926174164, "learning_rate": 0.0001968358602504944, "loss": 2.1656, "step": 55 }, { "epoch": 0.019743336623889437, "grad_norm": 0.49993693828582764, "learning_rate": 0.00019650626235992092, "loss": 2.0299, "step": 60 }, { "epoch": 0.021388614675880224, "grad_norm": 0.5019952654838562, "learning_rate": 0.0001961766644693474, "loss": 2.0951, "step": 65 }, { "epoch": 0.02303389272787101, "grad_norm": 0.4793168306350708, "learning_rate": 0.0001958470665787739, "loss": 2.1505, "step": 70 }, { "epoch": 0.024679170779861797, "grad_norm": 0.47024038434028625, "learning_rate": 0.0001955174686882004, "loss": 2.054, "step": 75 }, { "epoch": 0.026324448831852584, "grad_norm": 0.5920702815055847, "learning_rate": 0.0001951878707976269, "loss": 2.1802, "step": 80 }, { "epoch": 0.02796972688384337, "grad_norm": 0.4485560655593872, "learning_rate": 0.0001948582729070534, "loss": 2.1609, "step": 85 }, { "epoch": 0.029615004935834157, "grad_norm": 0.41952958703041077, "learning_rate": 0.0001945286750164799, "loss": 2.0584, "step": 90 }, { "epoch": 0.03126028298782494, "grad_norm": 0.40542706847190857, "learning_rate": 0.00019419907712590643, "loss": 2.22, "step": 95 }, { "epoch": 0.03290556103981573, "grad_norm": 0.438912570476532, "learning_rate": 0.0001938694792353329, "loss": 2.1478, "step": 100 }, { "epoch": 0.034550839091806514, "grad_norm": 0.46580272912979126, "learning_rate": 0.0001935398813447594, "loss": 2.1227, "step": 105 }, { "epoch": 0.0361961171437973, "grad_norm": 0.5012261271476746, "learning_rate": 0.0001932102834541859, "loss": 2.1274, "step": 110 }, { "epoch": 0.03784139519578809, "grad_norm": 0.5175459384918213, "learning_rate": 0.0001928806855636124, "loss": 2.0726, "step": 115 }, { "epoch": 0.039486673247778874, "grad_norm": 0.5441685318946838, "learning_rate": 0.00019255108767303892, "loss": 2.1269, "step": 120 }, { "epoch": 0.04113195129976966, "grad_norm": 0.4177902042865753, "learning_rate": 0.0001922214897824654, "loss": 2.164, "step": 125 }, { "epoch": 0.04277722935176045, "grad_norm": 0.44649720191955566, "learning_rate": 0.0001918918918918919, "loss": 2.0883, "step": 130 }, { "epoch": 0.044422507403751234, "grad_norm": 0.46839994192123413, "learning_rate": 0.0001915622940013184, "loss": 2.0448, "step": 135 }, { "epoch": 0.04606778545574202, "grad_norm": 0.4343637228012085, "learning_rate": 0.0001912326961107449, "loss": 2.0282, "step": 140 }, { "epoch": 0.04771306350773281, "grad_norm": 0.4241706132888794, "learning_rate": 0.00019090309822017141, "loss": 2.0895, "step": 145 }, { "epoch": 0.049358341559723594, "grad_norm": 0.44053712487220764, "learning_rate": 0.0001905735003295979, "loss": 2.0925, "step": 150 }, { "epoch": 0.05100361961171438, "grad_norm": 0.39026254415512085, "learning_rate": 0.0001902439024390244, "loss": 2.1519, "step": 155 }, { "epoch": 0.05264889766370517, "grad_norm": 0.455168217420578, "learning_rate": 0.0001899143045484509, "loss": 2.1623, "step": 160 }, { "epoch": 0.054294175715695954, "grad_norm": 0.4873504042625427, "learning_rate": 0.0001895847066578774, "loss": 1.9314, "step": 165 }, { "epoch": 0.05593945376768674, "grad_norm": 0.5435200929641724, "learning_rate": 0.0001892551087673039, "loss": 2.0674, "step": 170 }, { "epoch": 0.05758473181967753, "grad_norm": 0.44813185930252075, "learning_rate": 0.0001889255108767304, "loss": 2.0042, "step": 175 }, { "epoch": 0.059230009871668314, "grad_norm": 0.5993271470069885, "learning_rate": 0.0001885959129861569, "loss": 2.0942, "step": 180 }, { "epoch": 0.0608752879236591, "grad_norm": 0.5044869780540466, "learning_rate": 0.0001882663150955834, "loss": 2.118, "step": 185 }, { "epoch": 0.06252056597564988, "grad_norm": 0.4713231325149536, "learning_rate": 0.00018793671720500988, "loss": 2.1232, "step": 190 }, { "epoch": 0.06416584402764067, "grad_norm": 0.5351199507713318, "learning_rate": 0.0001876071193144364, "loss": 2.0956, "step": 195 }, { "epoch": 0.06581112207963145, "grad_norm": 0.380096971988678, "learning_rate": 0.0001872775214238629, "loss": 2.0466, "step": 200 }, { "epoch": 0.06745640013162224, "grad_norm": 0.4392818510532379, "learning_rate": 0.0001869479235332894, "loss": 2.0503, "step": 205 }, { "epoch": 0.06910167818361303, "grad_norm": 0.49540552496910095, "learning_rate": 0.00018661832564271588, "loss": 2.0771, "step": 210 }, { "epoch": 0.07074695623560381, "grad_norm": 0.5129232406616211, "learning_rate": 0.00018628872775214238, "loss": 1.9724, "step": 215 }, { "epoch": 0.0723922342875946, "grad_norm": 0.4697638750076294, "learning_rate": 0.0001859591298615689, "loss": 2.03, "step": 220 }, { "epoch": 0.07403751233958539, "grad_norm": 0.4250948131084442, "learning_rate": 0.0001856295319709954, "loss": 2.0235, "step": 225 }, { "epoch": 0.07568279039157617, "grad_norm": 0.5197622776031494, "learning_rate": 0.0001852999340804219, "loss": 1.991, "step": 230 }, { "epoch": 0.07732806844356696, "grad_norm": 0.45986393094062805, "learning_rate": 0.0001849703361898484, "loss": 2.0191, "step": 235 }, { "epoch": 0.07897334649555775, "grad_norm": 0.5618834495544434, "learning_rate": 0.00018464073829927487, "loss": 1.9859, "step": 240 }, { "epoch": 0.08061862454754853, "grad_norm": 0.5400542616844177, "learning_rate": 0.0001843111404087014, "loss": 1.9633, "step": 245 }, { "epoch": 0.08226390259953932, "grad_norm": 0.5259667038917542, "learning_rate": 0.0001839815425181279, "loss": 1.8768, "step": 250 }, { "epoch": 0.08390918065153011, "grad_norm": 0.5015618801116943, "learning_rate": 0.0001836519446275544, "loss": 2.0329, "step": 255 }, { "epoch": 0.0855544587035209, "grad_norm": 0.4835856854915619, "learning_rate": 0.0001833223467369809, "loss": 1.9815, "step": 260 }, { "epoch": 0.08719973675551168, "grad_norm": 0.4211411774158478, "learning_rate": 0.0001829927488464074, "loss": 1.9899, "step": 265 }, { "epoch": 0.08884501480750247, "grad_norm": 0.4507792294025421, "learning_rate": 0.0001826631509558339, "loss": 1.9865, "step": 270 }, { "epoch": 0.09049029285949325, "grad_norm": 0.5402964353561401, "learning_rate": 0.00018233355306526038, "loss": 1.902, "step": 275 }, { "epoch": 0.09213557091148404, "grad_norm": 0.4574088454246521, "learning_rate": 0.0001820039551746869, "loss": 1.9727, "step": 280 }, { "epoch": 0.09378084896347483, "grad_norm": 0.4615534842014313, "learning_rate": 0.0001816743572841134, "loss": 2.0582, "step": 285 }, { "epoch": 0.09542612701546561, "grad_norm": 0.5126486420631409, "learning_rate": 0.0001813447593935399, "loss": 2.0274, "step": 290 }, { "epoch": 0.0970714050674564, "grad_norm": 0.6757667660713196, "learning_rate": 0.00018101516150296638, "loss": 1.879, "step": 295 }, { "epoch": 0.09871668311944719, "grad_norm": 0.49488508701324463, "learning_rate": 0.00018068556361239288, "loss": 1.9348, "step": 300 }, { "epoch": 0.10036196117143797, "grad_norm": 0.5860428810119629, "learning_rate": 0.0001803559657218194, "loss": 1.9276, "step": 305 }, { "epoch": 0.10200723922342876, "grad_norm": 0.5148414373397827, "learning_rate": 0.0001800263678312459, "loss": 2.0563, "step": 310 }, { "epoch": 0.10365251727541955, "grad_norm": 0.5046892762184143, "learning_rate": 0.00017969676994067238, "loss": 2.0009, "step": 315 }, { "epoch": 0.10529779532741033, "grad_norm": 0.4465779960155487, "learning_rate": 0.0001793671720500989, "loss": 1.9842, "step": 320 }, { "epoch": 0.10694307337940112, "grad_norm": 0.4488319158554077, "learning_rate": 0.00017903757415952537, "loss": 1.9881, "step": 325 }, { "epoch": 0.10858835143139191, "grad_norm": 0.46680882573127747, "learning_rate": 0.0001787079762689519, "loss": 2.0471, "step": 330 }, { "epoch": 0.1102336294833827, "grad_norm": 0.5483986139297485, "learning_rate": 0.00017837837837837839, "loss": 2.0581, "step": 335 }, { "epoch": 0.11187890753537348, "grad_norm": 0.4938408434391022, "learning_rate": 0.00017804878048780488, "loss": 1.9712, "step": 340 }, { "epoch": 0.11352418558736427, "grad_norm": 0.4176371991634369, "learning_rate": 0.0001777191825972314, "loss": 1.9019, "step": 345 }, { "epoch": 0.11516946363935505, "grad_norm": 0.45936137437820435, "learning_rate": 0.0001773895847066579, "loss": 1.8773, "step": 350 }, { "epoch": 0.11681474169134584, "grad_norm": 0.5166374444961548, "learning_rate": 0.0001770599868160844, "loss": 2.0059, "step": 355 }, { "epoch": 0.11846001974333663, "grad_norm": 0.5485665202140808, "learning_rate": 0.00017673038892551088, "loss": 1.9634, "step": 360 }, { "epoch": 0.12010529779532741, "grad_norm": 0.44683098793029785, "learning_rate": 0.00017640079103493737, "loss": 1.9239, "step": 365 }, { "epoch": 0.1217505758473182, "grad_norm": 0.4426558315753937, "learning_rate": 0.0001760711931443639, "loss": 1.9467, "step": 370 }, { "epoch": 0.12339585389930899, "grad_norm": 0.45059794187545776, "learning_rate": 0.0001757415952537904, "loss": 1.9099, "step": 375 }, { "epoch": 0.12504113195129976, "grad_norm": 0.47326767444610596, "learning_rate": 0.00017541199736321688, "loss": 1.9521, "step": 380 }, { "epoch": 0.12668641000329056, "grad_norm": 0.4886496961116791, "learning_rate": 0.00017508239947264337, "loss": 1.9336, "step": 385 }, { "epoch": 0.12833168805528133, "grad_norm": 0.4394533336162567, "learning_rate": 0.00017475280158206987, "loss": 1.8583, "step": 390 }, { "epoch": 0.12997696610727213, "grad_norm": 0.5217518210411072, "learning_rate": 0.0001744232036914964, "loss": 1.9916, "step": 395 }, { "epoch": 0.1316222441592629, "grad_norm": 0.44888633489608765, "learning_rate": 0.00017409360580092288, "loss": 2.0022, "step": 400 }, { "epoch": 0.1332675222112537, "grad_norm": 0.5385366678237915, "learning_rate": 0.0001737640079103494, "loss": 1.9492, "step": 405 }, { "epoch": 0.13491280026324448, "grad_norm": 0.4314708113670349, "learning_rate": 0.00017343441001977587, "loss": 2.0088, "step": 410 }, { "epoch": 0.13655807831523528, "grad_norm": 0.4006335735321045, "learning_rate": 0.00017310481212920236, "loss": 1.9524, "step": 415 }, { "epoch": 0.13820335636722605, "grad_norm": 0.5291544198989868, "learning_rate": 0.00017277521423862888, "loss": 1.8593, "step": 420 }, { "epoch": 0.13984863441921686, "grad_norm": 0.47129592299461365, "learning_rate": 0.00017244561634805538, "loss": 1.9542, "step": 425 }, { "epoch": 0.14149391247120763, "grad_norm": 0.449595183134079, "learning_rate": 0.0001721160184574819, "loss": 1.9294, "step": 430 }, { "epoch": 0.14313919052319843, "grad_norm": 0.4410437047481537, "learning_rate": 0.0001717864205669084, "loss": 1.9209, "step": 435 }, { "epoch": 0.1447844685751892, "grad_norm": 0.5655415654182434, "learning_rate": 0.00017145682267633488, "loss": 1.9486, "step": 440 }, { "epoch": 0.14642974662718, "grad_norm": 0.5219452381134033, "learning_rate": 0.00017112722478576138, "loss": 1.8922, "step": 445 }, { "epoch": 0.14807502467917077, "grad_norm": 0.49918806552886963, "learning_rate": 0.00017079762689518787, "loss": 1.92, "step": 450 }, { "epoch": 0.14972030273116158, "grad_norm": 0.4410875141620636, "learning_rate": 0.0001704680290046144, "loss": 1.9363, "step": 455 }, { "epoch": 0.15136558078315235, "grad_norm": 0.4709133207798004, "learning_rate": 0.00017013843111404089, "loss": 1.8803, "step": 460 }, { "epoch": 0.15301085883514315, "grad_norm": 0.447390079498291, "learning_rate": 0.00016980883322346738, "loss": 2.0388, "step": 465 }, { "epoch": 0.15465613688713392, "grad_norm": 0.4439023435115814, "learning_rate": 0.00016947923533289387, "loss": 1.9112, "step": 470 }, { "epoch": 0.15630141493912472, "grad_norm": 0.5134996175765991, "learning_rate": 0.00016914963744232037, "loss": 1.9556, "step": 475 }, { "epoch": 0.1579466929911155, "grad_norm": 0.5412283539772034, "learning_rate": 0.0001688200395517469, "loss": 1.899, "step": 480 }, { "epoch": 0.1595919710431063, "grad_norm": 0.46328097581863403, "learning_rate": 0.00016849044166117338, "loss": 1.8476, "step": 485 }, { "epoch": 0.16123724909509707, "grad_norm": 0.43716996908187866, "learning_rate": 0.00016816084377059987, "loss": 1.9059, "step": 490 }, { "epoch": 0.16288252714708787, "grad_norm": 0.4769724905490875, "learning_rate": 0.00016783124588002637, "loss": 1.9186, "step": 495 }, { "epoch": 0.16452780519907864, "grad_norm": 0.5047943592071533, "learning_rate": 0.00016750164798945286, "loss": 1.9161, "step": 500 }, { "epoch": 0.16617308325106944, "grad_norm": 0.4556055963039398, "learning_rate": 0.00016717205009887938, "loss": 1.8834, "step": 505 }, { "epoch": 0.16781836130306022, "grad_norm": 0.4692705571651459, "learning_rate": 0.00016684245220830588, "loss": 1.9091, "step": 510 }, { "epoch": 0.16946363935505102, "grad_norm": 0.43482905626296997, "learning_rate": 0.00016651285431773237, "loss": 1.9405, "step": 515 }, { "epoch": 0.1711089174070418, "grad_norm": 0.5708907246589661, "learning_rate": 0.0001661832564271589, "loss": 1.9027, "step": 520 }, { "epoch": 0.1727541954590326, "grad_norm": 0.49181491136550903, "learning_rate": 0.00016585365853658536, "loss": 1.8526, "step": 525 }, { "epoch": 0.17439947351102336, "grad_norm": 0.5000940561294556, "learning_rate": 0.00016552406064601188, "loss": 1.8588, "step": 530 }, { "epoch": 0.17604475156301416, "grad_norm": 0.45289117097854614, "learning_rate": 0.00016519446275543837, "loss": 2.0358, "step": 535 }, { "epoch": 0.17769002961500494, "grad_norm": 0.5227617621421814, "learning_rate": 0.00016486486486486486, "loss": 1.8512, "step": 540 }, { "epoch": 0.17933530766699574, "grad_norm": 0.512492299079895, "learning_rate": 0.00016453526697429138, "loss": 1.9014, "step": 545 }, { "epoch": 0.1809805857189865, "grad_norm": 0.4832890033721924, "learning_rate": 0.00016420566908371785, "loss": 1.9765, "step": 550 }, { "epoch": 0.18262586377097728, "grad_norm": 0.4797350764274597, "learning_rate": 0.00016387607119314437, "loss": 1.8907, "step": 555 }, { "epoch": 0.18427114182296808, "grad_norm": 0.48889973759651184, "learning_rate": 0.00016354647330257086, "loss": 1.9369, "step": 560 }, { "epoch": 0.18591641987495885, "grad_norm": 0.44188860058784485, "learning_rate": 0.00016321687541199739, "loss": 1.8547, "step": 565 }, { "epoch": 0.18756169792694966, "grad_norm": 0.5300605893135071, "learning_rate": 0.00016288727752142388, "loss": 1.8603, "step": 570 }, { "epoch": 0.18920697597894043, "grad_norm": 0.5411728620529175, "learning_rate": 0.00016255767963085037, "loss": 1.8067, "step": 575 }, { "epoch": 0.19085225403093123, "grad_norm": 0.5871673226356506, "learning_rate": 0.00016222808174027687, "loss": 1.8875, "step": 580 }, { "epoch": 0.192497532082922, "grad_norm": 0.4898117482662201, "learning_rate": 0.00016189848384970336, "loss": 1.9148, "step": 585 }, { "epoch": 0.1941428101349128, "grad_norm": 0.4848552346229553, "learning_rate": 0.00016156888595912988, "loss": 1.8179, "step": 590 }, { "epoch": 0.19578808818690357, "grad_norm": 0.49930569529533386, "learning_rate": 0.00016123928806855637, "loss": 1.9306, "step": 595 }, { "epoch": 0.19743336623889438, "grad_norm": 0.47816231846809387, "learning_rate": 0.00016090969017798287, "loss": 1.8287, "step": 600 }, { "epoch": 0.19907864429088515, "grad_norm": 0.5759291052818298, "learning_rate": 0.0001605800922874094, "loss": 1.874, "step": 605 }, { "epoch": 0.20072392234287595, "grad_norm": 0.5100725889205933, "learning_rate": 0.00016025049439683585, "loss": 1.8369, "step": 610 }, { "epoch": 0.20236920039486672, "grad_norm": 0.5101613998413086, "learning_rate": 0.00015992089650626237, "loss": 1.9083, "step": 615 }, { "epoch": 0.20401447844685752, "grad_norm": 0.4661300778388977, "learning_rate": 0.00015959129861568887, "loss": 1.9463, "step": 620 }, { "epoch": 0.2056597564988483, "grad_norm": 0.4888961613178253, "learning_rate": 0.00015926170072511536, "loss": 1.7347, "step": 625 }, { "epoch": 0.2073050345508391, "grad_norm": 0.535188615322113, "learning_rate": 0.00015893210283454188, "loss": 1.9484, "step": 630 }, { "epoch": 0.20895031260282987, "grad_norm": 0.5426183938980103, "learning_rate": 0.00015860250494396835, "loss": 1.8787, "step": 635 }, { "epoch": 0.21059559065482067, "grad_norm": 0.5056272745132446, "learning_rate": 0.00015827290705339487, "loss": 1.8434, "step": 640 }, { "epoch": 0.21224086870681144, "grad_norm": 0.47972238063812256, "learning_rate": 0.00015794330916282136, "loss": 1.8703, "step": 645 }, { "epoch": 0.21388614675880224, "grad_norm": 0.517246425151825, "learning_rate": 0.00015761371127224786, "loss": 1.8613, "step": 650 }, { "epoch": 0.21553142481079302, "grad_norm": 0.4869682490825653, "learning_rate": 0.00015728411338167438, "loss": 1.9388, "step": 655 }, { "epoch": 0.21717670286278382, "grad_norm": 0.5097903609275818, "learning_rate": 0.00015695451549110087, "loss": 1.8254, "step": 660 }, { "epoch": 0.2188219809147746, "grad_norm": 0.5445834398269653, "learning_rate": 0.00015662491760052736, "loss": 1.8341, "step": 665 }, { "epoch": 0.2204672589667654, "grad_norm": 0.4832437336444855, "learning_rate": 0.00015629531970995386, "loss": 1.9317, "step": 670 }, { "epoch": 0.22211253701875616, "grad_norm": 0.45982053875923157, "learning_rate": 0.00015596572181938035, "loss": 1.8398, "step": 675 }, { "epoch": 0.22375781507074696, "grad_norm": 0.5181317329406738, "learning_rate": 0.00015563612392880687, "loss": 1.776, "step": 680 }, { "epoch": 0.22540309312273774, "grad_norm": 0.5491596460342407, "learning_rate": 0.00015530652603823337, "loss": 1.7015, "step": 685 }, { "epoch": 0.22704837117472854, "grad_norm": 0.588294267654419, "learning_rate": 0.00015497692814765986, "loss": 1.8256, "step": 690 }, { "epoch": 0.2286936492267193, "grad_norm": 0.5512030720710754, "learning_rate": 0.00015464733025708635, "loss": 1.7993, "step": 695 }, { "epoch": 0.2303389272787101, "grad_norm": 0.5528048276901245, "learning_rate": 0.00015431773236651285, "loss": 1.8213, "step": 700 }, { "epoch": 0.23198420533070088, "grad_norm": 0.5295950174331665, "learning_rate": 0.00015398813447593937, "loss": 1.7401, "step": 705 }, { "epoch": 0.23362948338269168, "grad_norm": 0.4853232204914093, "learning_rate": 0.00015365853658536586, "loss": 1.8653, "step": 710 }, { "epoch": 0.23527476143468246, "grad_norm": 0.4380806088447571, "learning_rate": 0.00015332893869479238, "loss": 1.7843, "step": 715 }, { "epoch": 0.23692003948667326, "grad_norm": 0.5240153074264526, "learning_rate": 0.00015299934080421887, "loss": 1.7548, "step": 720 }, { "epoch": 0.23856531753866403, "grad_norm": 0.5147051811218262, "learning_rate": 0.00015266974291364534, "loss": 1.8582, "step": 725 }, { "epoch": 0.24021059559065483, "grad_norm": 0.5330904126167297, "learning_rate": 0.00015234014502307186, "loss": 1.737, "step": 730 }, { "epoch": 0.2418558736426456, "grad_norm": 0.5441175699234009, "learning_rate": 0.00015201054713249835, "loss": 1.7784, "step": 735 }, { "epoch": 0.2435011516946364, "grad_norm": 0.5241209268569946, "learning_rate": 0.00015168094924192488, "loss": 1.8404, "step": 740 }, { "epoch": 0.24514642974662718, "grad_norm": 0.5611084699630737, "learning_rate": 0.00015135135135135137, "loss": 1.7501, "step": 745 }, { "epoch": 0.24679170779861798, "grad_norm": 0.5845035314559937, "learning_rate": 0.00015102175346077784, "loss": 1.7205, "step": 750 }, { "epoch": 0.24843698585060875, "grad_norm": 0.5952686667442322, "learning_rate": 0.00015069215557020436, "loss": 1.8205, "step": 755 }, { "epoch": 0.2500822639025995, "grad_norm": 0.652553379535675, "learning_rate": 0.00015036255767963085, "loss": 1.8133, "step": 760 }, { "epoch": 0.25172754195459035, "grad_norm": 0.5319493412971497, "learning_rate": 0.00015003295978905737, "loss": 1.7484, "step": 765 }, { "epoch": 0.2533728200065811, "grad_norm": 0.5144583582878113, "learning_rate": 0.00014970336189848386, "loss": 1.825, "step": 770 }, { "epoch": 0.2550180980585719, "grad_norm": 0.5155309438705444, "learning_rate": 0.00014937376400791036, "loss": 1.7823, "step": 775 }, { "epoch": 0.25666337611056267, "grad_norm": 0.5156668424606323, "learning_rate": 0.00014904416611733685, "loss": 1.8167, "step": 780 }, { "epoch": 0.2583086541625535, "grad_norm": 0.5913170576095581, "learning_rate": 0.00014871456822676334, "loss": 1.7961, "step": 785 }, { "epoch": 0.25995393221454427, "grad_norm": 0.5405130982398987, "learning_rate": 0.00014838497033618986, "loss": 1.6679, "step": 790 }, { "epoch": 0.26159921026653504, "grad_norm": 0.5765439867973328, "learning_rate": 0.00014805537244561636, "loss": 1.7794, "step": 795 }, { "epoch": 0.2632444883185258, "grad_norm": 0.5708970427513123, "learning_rate": 0.00014772577455504285, "loss": 1.8888, "step": 800 } ], "logging_steps": 5, "max_steps": 3039, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6307808646864896e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }