| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.995397008055236, |
| "eval_steps": 500, |
| "global_step": 868, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.023014959723820484, |
| "grad_norm": 25.436981238085195, |
| "learning_rate": 5.747126436781609e-08, |
| "loss": 0.6046, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.04602991944764097, |
| "grad_norm": 20.828982286160393, |
| "learning_rate": 1.1494252873563217e-07, |
| "loss": 0.5684, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.06904487917146145, |
| "grad_norm": 18.79182460742335, |
| "learning_rate": 1.7241379310344828e-07, |
| "loss": 0.5887, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.09205983889528194, |
| "grad_norm": 17.689641482327932, |
| "learning_rate": 2.2988505747126435e-07, |
| "loss": 0.6554, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.11507479861910241, |
| "grad_norm": 16.065937870878834, |
| "learning_rate": 2.873563218390804e-07, |
| "loss": 0.5661, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.1380897583429229, |
| "grad_norm": 16.348847664761777, |
| "learning_rate": 3.4482758620689656e-07, |
| "loss": 0.6168, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.1611047180667434, |
| "grad_norm": 17.719848872005795, |
| "learning_rate": 4.0229885057471266e-07, |
| "loss": 0.5178, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.18411967779056387, |
| "grad_norm": 15.539337894638383, |
| "learning_rate": 4.597701149425287e-07, |
| "loss": 0.5083, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.20713463751438435, |
| "grad_norm": 12.464237258707364, |
| "learning_rate": 5.172413793103448e-07, |
| "loss": 0.5685, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.23014959723820483, |
| "grad_norm": 23.062114966571542, |
| "learning_rate": 5.747126436781608e-07, |
| "loss": 0.5859, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.25316455696202533, |
| "grad_norm": 12.808034602784485, |
| "learning_rate": 6.32183908045977e-07, |
| "loss": 0.5568, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.2761795166858458, |
| "grad_norm": 15.176748325531914, |
| "learning_rate": 6.896551724137931e-07, |
| "loss": 0.4969, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.2991944764096663, |
| "grad_norm": 18.200134879195883, |
| "learning_rate": 7.471264367816092e-07, |
| "loss": 0.5486, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.3222094361334868, |
| "grad_norm": 16.483146212519067, |
| "learning_rate": 8.045977011494253e-07, |
| "loss": 0.5261, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.34522439585730724, |
| "grad_norm": 9.46076044473471, |
| "learning_rate": 8.620689655172412e-07, |
| "loss": 0.4849, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.36823935558112775, |
| "grad_norm": 17.374061304465616, |
| "learning_rate": 9.195402298850574e-07, |
| "loss": 0.5233, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.3912543153049482, |
| "grad_norm": 15.210933453312531, |
| "learning_rate": 9.770114942528735e-07, |
| "loss": 0.5727, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.4142692750287687, |
| "grad_norm": 12.981120100801451, |
| "learning_rate": 9.999635938356475e-07, |
| "loss": 0.5257, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.4372842347525892, |
| "grad_norm": 17.675647072729685, |
| "learning_rate": 9.997411309192068e-07, |
| "loss": 0.5459, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.46029919447640966, |
| "grad_norm": 14.258404245690016, |
| "learning_rate": 9.993165206112856e-07, |
| "loss": 0.526, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.48331415420023016, |
| "grad_norm": 14.135034352681272, |
| "learning_rate": 9.98689934668541e-07, |
| "loss": 0.537, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.5063291139240507, |
| "grad_norm": 14.441236134574162, |
| "learning_rate": 9.978616265476252e-07, |
| "loss": 0.5772, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.5293440736478712, |
| "grad_norm": 18.566704225296032, |
| "learning_rate": 9.9683193130266e-07, |
| "loss": 0.5539, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.5523590333716916, |
| "grad_norm": 20.35165674585742, |
| "learning_rate": 9.956012654497072e-07, |
| "loss": 0.5824, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.5753739930955121, |
| "grad_norm": 18.442515510965833, |
| "learning_rate": 9.941701267982862e-07, |
| "loss": 0.5599, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.5983889528193326, |
| "grad_norm": 15.875900780315595, |
| "learning_rate": 9.925390942500064e-07, |
| "loss": 0.5304, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.6214039125431531, |
| "grad_norm": 15.34860260658814, |
| "learning_rate": 9.907088275644012e-07, |
| "loss": 0.5846, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.6444188722669736, |
| "grad_norm": 21.447791339652113, |
| "learning_rate": 9.8868006709205e-07, |
| "loss": 0.5849, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.667433831990794, |
| "grad_norm": 10.808057687362574, |
| "learning_rate": 9.864536334751061e-07, |
| "loss": 0.4918, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.6904487917146145, |
| "grad_norm": 12.425855367187637, |
| "learning_rate": 9.84030427315341e-07, |
| "loss": 0.5746, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.713463751438435, |
| "grad_norm": 21.87717141819886, |
| "learning_rate": 9.814114288098486e-07, |
| "loss": 0.5923, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.7364787111622555, |
| "grad_norm": 14.90837695531576, |
| "learning_rate": 9.78597697354551e-07, |
| "loss": 0.5658, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.759493670886076, |
| "grad_norm": 13.422204897299519, |
| "learning_rate": 9.755903711156684e-07, |
| "loss": 0.5876, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.7825086306098964, |
| "grad_norm": 13.807384773589732, |
| "learning_rate": 9.723906665693258e-07, |
| "loss": 0.6197, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.8055235903337169, |
| "grad_norm": 17.890041232308786, |
| "learning_rate": 9.689998780094837e-07, |
| "loss": 0.6169, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.8285385500575374, |
| "grad_norm": 12.171428212644345, |
| "learning_rate": 9.654193770243905e-07, |
| "loss": 0.5251, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.8515535097813579, |
| "grad_norm": 14.00509518878877, |
| "learning_rate": 9.616506119417697e-07, |
| "loss": 0.5351, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.8745684695051784, |
| "grad_norm": 13.855656981961861, |
| "learning_rate": 9.576951072429643e-07, |
| "loss": 0.4762, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.8975834292289988, |
| "grad_norm": 12.530836070851475, |
| "learning_rate": 9.535544629462786e-07, |
| "loss": 0.5654, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.9205983889528193, |
| "grad_norm": 14.94597069900537, |
| "learning_rate": 9.492303539597636e-07, |
| "loss": 0.5434, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.9436133486766398, |
| "grad_norm": 12.929882120022377, |
| "learning_rate": 9.4472452940371e-07, |
| "loss": 0.5511, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.9666283084004603, |
| "grad_norm": 11.785693957022563, |
| "learning_rate": 9.400388119031211e-07, |
| "loss": 0.5743, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.9896432681242808, |
| "grad_norm": 11.4673892135634, |
| "learning_rate": 9.351750968504539e-07, |
| "loss": 0.5289, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.998849252013809, |
| "eval_loss": 0.5940150022506714, |
| "eval_runtime": 19.3038, |
| "eval_samples_per_second": 7.356, |
| "eval_steps_per_second": 0.932, |
| "step": 217 |
| }, |
| { |
| "epoch": 1.0126582278481013, |
| "grad_norm": 9.979009329427443, |
| "learning_rate": 9.301353516389246e-07, |
| "loss": 0.493, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.0356731875719218, |
| "grad_norm": 9.627235780909444, |
| "learning_rate": 9.249216148666895e-07, |
| "loss": 0.4392, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.0586881472957423, |
| "grad_norm": 10.663799414852592, |
| "learning_rate": 9.195359955122243e-07, |
| "loss": 0.4452, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.0817031070195626, |
| "grad_norm": 9.923849389538619, |
| "learning_rate": 9.139806720812324e-07, |
| "loss": 0.4312, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.1047180667433831, |
| "grad_norm": 13.08615355461209, |
| "learning_rate": 9.082578917254309e-07, |
| "loss": 0.4213, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.1277330264672036, |
| "grad_norm": 14.694279179845232, |
| "learning_rate": 9.023699693335678e-07, |
| "loss": 0.4362, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.1507479861910241, |
| "grad_norm": 13.373390196527911, |
| "learning_rate": 8.963192865950402e-07, |
| "loss": 0.4383, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.1737629459148446, |
| "grad_norm": 15.92838239960798, |
| "learning_rate": 8.901082910364906e-07, |
| "loss": 0.4091, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.1967779056386652, |
| "grad_norm": 11.132023780497018, |
| "learning_rate": 8.83739495031772e-07, |
| "loss": 0.4542, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.2197928653624857, |
| "grad_norm": 12.973494941543084, |
| "learning_rate": 8.772154747856825e-07, |
| "loss": 0.464, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.2428078250863062, |
| "grad_norm": 10.383261084427103, |
| "learning_rate": 8.705388692918792e-07, |
| "loss": 0.4561, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.2658227848101267, |
| "grad_norm": 12.748442378817272, |
| "learning_rate": 8.637123792653946e-07, |
| "loss": 0.4947, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.288837744533947, |
| "grad_norm": 13.222343698654353, |
| "learning_rate": 8.567387660501851e-07, |
| "loss": 0.468, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.3118527042577677, |
| "grad_norm": 13.75340991399124, |
| "learning_rate": 8.49620850502157e-07, |
| "loss": 0.4949, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.334867663981588, |
| "grad_norm": 13.111144430939557, |
| "learning_rate": 8.423615118481175e-07, |
| "loss": 0.4303, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.3578826237054085, |
| "grad_norm": 12.913854941381981, |
| "learning_rate": 8.349636865211156e-07, |
| "loss": 0.4768, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.380897583429229, |
| "grad_norm": 11.088379869831822, |
| "learning_rate": 8.274303669726426e-07, |
| "loss": 0.3887, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.4039125431530495, |
| "grad_norm": 14.136934996189732, |
| "learning_rate": 8.197646004621728e-07, |
| "loss": 0.4006, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.42692750287687, |
| "grad_norm": 13.472677332352156, |
| "learning_rate": 8.119694878245342e-07, |
| "loss": 0.4591, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.4499424626006905, |
| "grad_norm": 16.812648158386022, |
| "learning_rate": 8.040481822156082e-07, |
| "loss": 0.4317, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.472957422324511, |
| "grad_norm": 12.431554542835052, |
| "learning_rate": 7.96003887836864e-07, |
| "loss": 0.495, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.4959723820483315, |
| "grad_norm": 11.033195564551397, |
| "learning_rate": 7.878398586392461e-07, |
| "loss": 0.4493, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.518987341772152, |
| "grad_norm": 14.98570304225286, |
| "learning_rate": 7.795593970069373e-07, |
| "loss": 0.423, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.5420023014959723, |
| "grad_norm": 15.240270393540726, |
| "learning_rate": 7.711658524215305e-07, |
| "loss": 0.427, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.565017261219793, |
| "grad_norm": 13.308584149699858, |
| "learning_rate": 7.626626201071493e-07, |
| "loss": 0.4334, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.5880322209436133, |
| "grad_norm": 11.918078822598936, |
| "learning_rate": 7.540531396570655e-07, |
| "loss": 0.4526, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.6110471806674338, |
| "grad_norm": 16.581675862528876, |
| "learning_rate": 7.453408936423687e-07, |
| "loss": 0.5097, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.6340621403912543, |
| "grad_norm": 15.37481667798687, |
| "learning_rate": 7.365294062032528e-07, |
| "loss": 0.432, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.6570771001150748, |
| "grad_norm": 14.096321138322955, |
| "learning_rate": 7.27622241623485e-07, |
| "loss": 0.4631, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.6800920598388953, |
| "grad_norm": 11.953450213331397, |
| "learning_rate": 7.18623002888639e-07, |
| "loss": 0.4227, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.7031070195627158, |
| "grad_norm": 15.401901846965378, |
| "learning_rate": 7.095353302286721e-07, |
| "loss": 0.4424, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.7261219792865363, |
| "grad_norm": 12.52624816605781, |
| "learning_rate": 7.003628996454361e-07, |
| "loss": 0.4238, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.7491369390103566, |
| "grad_norm": 13.884043525893127, |
| "learning_rate": 6.911094214257204e-07, |
| "loss": 0.4881, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.7721518987341773, |
| "grad_norm": 12.763240064118046, |
| "learning_rate": 6.817786386404237e-07, |
| "loss": 0.4292, |
| "step": 385 |
| }, |
| { |
| "epoch": 1.7951668584579976, |
| "grad_norm": 14.214477794634833, |
| "learning_rate": 6.723743256304676e-07, |
| "loss": 0.3902, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.8181818181818183, |
| "grad_norm": 17.56505032673697, |
| "learning_rate": 6.629002864800588e-07, |
| "loss": 0.4949, |
| "step": 395 |
| }, |
| { |
| "epoch": 1.8411967779056386, |
| "grad_norm": 13.349528895489893, |
| "learning_rate": 6.533603534779215e-07, |
| "loss": 0.4711, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.8642117376294591, |
| "grad_norm": 12.236389004746698, |
| "learning_rate": 6.437583855671204e-07, |
| "loss": 0.4669, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.8872266973532796, |
| "grad_norm": 15.00019380907836, |
| "learning_rate": 6.340982667841021e-07, |
| "loss": 0.412, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.9102416570771001, |
| "grad_norm": 14.956008054688557, |
| "learning_rate": 6.243839046875853e-07, |
| "loss": 0.4395, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.9332566168009206, |
| "grad_norm": 12.692934607546489, |
| "learning_rate": 6.146192287779377e-07, |
| "loss": 0.4188, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.956271576524741, |
| "grad_norm": 13.129064291695105, |
| "learning_rate": 6.048081889076766e-07, |
| "loss": 0.4306, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.9792865362485617, |
| "grad_norm": 14.905784082915199, |
| "learning_rate": 5.949547536837359e-07, |
| "loss": 0.4472, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.997698504027618, |
| "eval_loss": 0.6037406325340271, |
| "eval_runtime": 19.2686, |
| "eval_samples_per_second": 7.369, |
| "eval_steps_per_second": 0.934, |
| "step": 434 |
| }, |
| { |
| "epoch": 2.002301495972382, |
| "grad_norm": 10.451580707887741, |
| "learning_rate": 5.85062908862149e-07, |
| "loss": 0.4338, |
| "step": 435 |
| }, |
| { |
| "epoch": 2.0253164556962027, |
| "grad_norm": 10.700383234622784, |
| "learning_rate": 5.751366557357933e-07, |
| "loss": 0.3787, |
| "step": 440 |
| }, |
| { |
| "epoch": 2.048331415420023, |
| "grad_norm": 11.08124701052113, |
| "learning_rate": 5.651800095158501e-07, |
| "loss": 0.3198, |
| "step": 445 |
| }, |
| { |
| "epoch": 2.0713463751438437, |
| "grad_norm": 8.676543638522384, |
| "learning_rate": 5.551969977076349e-07, |
| "loss": 0.3124, |
| "step": 450 |
| }, |
| { |
| "epoch": 2.094361334867664, |
| "grad_norm": 11.853074754996486, |
| "learning_rate": 5.451916584814551e-07, |
| "loss": 0.3579, |
| "step": 455 |
| }, |
| { |
| "epoch": 2.1173762945914847, |
| "grad_norm": 16.53049309392707, |
| "learning_rate": 5.351680390391524e-07, |
| "loss": 0.3046, |
| "step": 460 |
| }, |
| { |
| "epoch": 2.140391254315305, |
| "grad_norm": 11.731708343253732, |
| "learning_rate": 5.25130193976993e-07, |
| "loss": 0.3058, |
| "step": 465 |
| }, |
| { |
| "epoch": 2.1634062140391253, |
| "grad_norm": 11.499082702306787, |
| "learning_rate": 5.150821836455659e-07, |
| "loss": 0.3376, |
| "step": 470 |
| }, |
| { |
| "epoch": 2.186421173762946, |
| "grad_norm": 12.34107746817991, |
| "learning_rate": 5.05028072507355e-07, |
| "loss": 0.3237, |
| "step": 475 |
| }, |
| { |
| "epoch": 2.2094361334867663, |
| "grad_norm": 17.370024261115674, |
| "learning_rate": 4.949719274926452e-07, |
| "loss": 0.2975, |
| "step": 480 |
| }, |
| { |
| "epoch": 2.232451093210587, |
| "grad_norm": 9.197724681633677, |
| "learning_rate": 4.84917816354434e-07, |
| "loss": 0.2967, |
| "step": 485 |
| }, |
| { |
| "epoch": 2.2554660529344073, |
| "grad_norm": 12.648733804765213, |
| "learning_rate": 4.748698060230071e-07, |
| "loss": 0.3198, |
| "step": 490 |
| }, |
| { |
| "epoch": 2.278481012658228, |
| "grad_norm": 13.903180310283751, |
| "learning_rate": 4.648319609608476e-07, |
| "loss": 0.3128, |
| "step": 495 |
| }, |
| { |
| "epoch": 2.3014959723820483, |
| "grad_norm": 15.830064172391179, |
| "learning_rate": 4.548083415185448e-07, |
| "loss": 0.311, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.324510932105869, |
| "grad_norm": 14.674255723714507, |
| "learning_rate": 4.4480300229236517e-07, |
| "loss": 0.3207, |
| "step": 505 |
| }, |
| { |
| "epoch": 2.3475258918296893, |
| "grad_norm": 12.895709713108205, |
| "learning_rate": 4.3481999048415e-07, |
| "loss": 0.3417, |
| "step": 510 |
| }, |
| { |
| "epoch": 2.3705408515535096, |
| "grad_norm": 8.929819185065151, |
| "learning_rate": 4.248633442642067e-07, |
| "loss": 0.3177, |
| "step": 515 |
| }, |
| { |
| "epoch": 2.3935558112773303, |
| "grad_norm": 13.644125606933226, |
| "learning_rate": 4.1493709113785087e-07, |
| "loss": 0.3031, |
| "step": 520 |
| }, |
| { |
| "epoch": 2.4165707710011506, |
| "grad_norm": 11.673758922253713, |
| "learning_rate": 4.050452463162642e-07, |
| "loss": 0.3104, |
| "step": 525 |
| }, |
| { |
| "epoch": 2.4395857307249713, |
| "grad_norm": 12.820622173176774, |
| "learning_rate": 3.9519181109232345e-07, |
| "loss": 0.3063, |
| "step": 530 |
| }, |
| { |
| "epoch": 2.4626006904487916, |
| "grad_norm": 18.02707809486906, |
| "learning_rate": 3.853807712220622e-07, |
| "loss": 0.3025, |
| "step": 535 |
| }, |
| { |
| "epoch": 2.4856156501726123, |
| "grad_norm": 11.49239340519658, |
| "learning_rate": 3.756160953124149e-07, |
| "loss": 0.307, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.5086306098964326, |
| "grad_norm": 13.193325188422616, |
| "learning_rate": 3.65901733215898e-07, |
| "loss": 0.317, |
| "step": 545 |
| }, |
| { |
| "epoch": 2.5316455696202533, |
| "grad_norm": 12.852961746212529, |
| "learning_rate": 3.5624161443287947e-07, |
| "loss": 0.3089, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.5546605293440736, |
| "grad_norm": 15.085048095240678, |
| "learning_rate": 3.4663964652207836e-07, |
| "loss": 0.3162, |
| "step": 555 |
| }, |
| { |
| "epoch": 2.577675489067894, |
| "grad_norm": 15.098026200789873, |
| "learning_rate": 3.3709971351994126e-07, |
| "loss": 0.3326, |
| "step": 560 |
| }, |
| { |
| "epoch": 2.6006904487917146, |
| "grad_norm": 13.107735056669913, |
| "learning_rate": 3.276256743695324e-07, |
| "loss": 0.3041, |
| "step": 565 |
| }, |
| { |
| "epoch": 2.6237054085155354, |
| "grad_norm": 13.701942344015102, |
| "learning_rate": 3.1822136135957635e-07, |
| "loss": 0.3054, |
| "step": 570 |
| }, |
| { |
| "epoch": 2.6467203682393556, |
| "grad_norm": 12.49222926227634, |
| "learning_rate": 3.088905785742797e-07, |
| "loss": 0.3032, |
| "step": 575 |
| }, |
| { |
| "epoch": 2.669735327963176, |
| "grad_norm": 19.83941830585977, |
| "learning_rate": 2.9963710035456393e-07, |
| "loss": 0.3066, |
| "step": 580 |
| }, |
| { |
| "epoch": 2.6927502876869966, |
| "grad_norm": 12.875796703207728, |
| "learning_rate": 2.904646697713278e-07, |
| "loss": 0.2887, |
| "step": 585 |
| }, |
| { |
| "epoch": 2.715765247410817, |
| "grad_norm": 13.779139491323368, |
| "learning_rate": 2.813769971113608e-07, |
| "loss": 0.3195, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.7387802071346377, |
| "grad_norm": 15.357477574862349, |
| "learning_rate": 2.72377758376515e-07, |
| "loss": 0.3369, |
| "step": 595 |
| }, |
| { |
| "epoch": 2.761795166858458, |
| "grad_norm": 14.296503558288, |
| "learning_rate": 2.634705937967471e-07, |
| "loss": 0.2943, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.7848101265822782, |
| "grad_norm": 12.061535257713842, |
| "learning_rate": 2.546591063576312e-07, |
| "loss": 0.284, |
| "step": 605 |
| }, |
| { |
| "epoch": 2.807825086306099, |
| "grad_norm": 16.839618374963486, |
| "learning_rate": 2.459468603429345e-07, |
| "loss": 0.3, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.8308400460299197, |
| "grad_norm": 14.189090458195743, |
| "learning_rate": 2.3733737989285068e-07, |
| "loss": 0.302, |
| "step": 615 |
| }, |
| { |
| "epoch": 2.85385500575374, |
| "grad_norm": 13.32887690082132, |
| "learning_rate": 2.2883414757846948e-07, |
| "loss": 0.3137, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.8768699654775602, |
| "grad_norm": 12.574497648587789, |
| "learning_rate": 2.2044060299306267e-07, |
| "loss": 0.2839, |
| "step": 625 |
| }, |
| { |
| "epoch": 2.899884925201381, |
| "grad_norm": 15.048298566531198, |
| "learning_rate": 2.12160141360754e-07, |
| "loss": 0.2973, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.9228998849252013, |
| "grad_norm": 11.951339536247458, |
| "learning_rate": 2.03996112163136e-07, |
| "loss": 0.3097, |
| "step": 635 |
| }, |
| { |
| "epoch": 2.945914844649022, |
| "grad_norm": 15.69565784514616, |
| "learning_rate": 1.9595181778439174e-07, |
| "loss": 0.2962, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.9689298043728423, |
| "grad_norm": 12.999365393476868, |
| "learning_rate": 1.8803051217546584e-07, |
| "loss": 0.3009, |
| "step": 645 |
| }, |
| { |
| "epoch": 2.991944764096663, |
| "grad_norm": 14.935058458987008, |
| "learning_rate": 1.8023539953782735e-07, |
| "loss": 0.2872, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.996547756041427, |
| "eval_loss": 0.6948590278625488, |
| "eval_runtime": 19.2791, |
| "eval_samples_per_second": 7.365, |
| "eval_steps_per_second": 0.934, |
| "step": 651 |
| }, |
| { |
| "epoch": 3.0149597238204833, |
| "grad_norm": 13.075885224885349, |
| "learning_rate": 1.725696330273575e-07, |
| "loss": 0.2512, |
| "step": 655 |
| }, |
| { |
| "epoch": 3.037974683544304, |
| "grad_norm": 9.704290953271537, |
| "learning_rate": 1.6503631347888436e-07, |
| "loss": 0.2256, |
| "step": 660 |
| }, |
| { |
| "epoch": 3.0609896432681243, |
| "grad_norm": 12.84818126176939, |
| "learning_rate": 1.5763848815188258e-07, |
| "loss": 0.2491, |
| "step": 665 |
| }, |
| { |
| "epoch": 3.0840046029919446, |
| "grad_norm": 11.493489134477986, |
| "learning_rate": 1.5037914949784296e-07, |
| "loss": 0.2386, |
| "step": 670 |
| }, |
| { |
| "epoch": 3.1070195627157653, |
| "grad_norm": 12.705781953882676, |
| "learning_rate": 1.432612339498148e-07, |
| "loss": 0.2488, |
| "step": 675 |
| }, |
| { |
| "epoch": 3.1300345224395856, |
| "grad_norm": 13.901431552375566, |
| "learning_rate": 1.3628762073460548e-07, |
| "loss": 0.2305, |
| "step": 680 |
| }, |
| { |
| "epoch": 3.1530494821634063, |
| "grad_norm": 14.715304886124786, |
| "learning_rate": 1.294611307081207e-07, |
| "loss": 0.2386, |
| "step": 685 |
| }, |
| { |
| "epoch": 3.1760644418872266, |
| "grad_norm": 10.222299037925385, |
| "learning_rate": 1.2278452521431744e-07, |
| "loss": 0.2339, |
| "step": 690 |
| }, |
| { |
| "epoch": 3.1990794016110473, |
| "grad_norm": 10.930465595389542, |
| "learning_rate": 1.1626050496822793e-07, |
| "loss": 0.2413, |
| "step": 695 |
| }, |
| { |
| "epoch": 3.2220943613348676, |
| "grad_norm": 12.451641988411806, |
| "learning_rate": 1.0989170896350947e-07, |
| "loss": 0.2284, |
| "step": 700 |
| }, |
| { |
| "epoch": 3.2451093210586883, |
| "grad_norm": 9.137656540405636, |
| "learning_rate": 1.0368071340495976e-07, |
| "loss": 0.2395, |
| "step": 705 |
| }, |
| { |
| "epoch": 3.2681242807825086, |
| "grad_norm": 11.671230840455808, |
| "learning_rate": 9.76300306664321e-08, |
| "loss": 0.253, |
| "step": 710 |
| }, |
| { |
| "epoch": 3.291139240506329, |
| "grad_norm": 17.933280474807955, |
| "learning_rate": 9.174210827456913e-08, |
| "loss": 0.2585, |
| "step": 715 |
| }, |
| { |
| "epoch": 3.3141542002301496, |
| "grad_norm": 12.982890796403161, |
| "learning_rate": 8.601932791876754e-08, |
| "loss": 0.2269, |
| "step": 720 |
| }, |
| { |
| "epoch": 3.33716915995397, |
| "grad_norm": 8.384602246690656, |
| "learning_rate": 8.046400448777574e-08, |
| "loss": 0.2253, |
| "step": 725 |
| }, |
| { |
| "epoch": 3.3601841196777906, |
| "grad_norm": 10.342720782810092, |
| "learning_rate": 7.507838513331049e-08, |
| "loss": 0.233, |
| "step": 730 |
| }, |
| { |
| "epoch": 3.383199079401611, |
| "grad_norm": 12.679510689599434, |
| "learning_rate": 6.986464836107547e-08, |
| "loss": 0.2433, |
| "step": 735 |
| }, |
| { |
| "epoch": 3.4062140391254316, |
| "grad_norm": 10.38040904275821, |
| "learning_rate": 6.48249031495462e-08, |
| "loss": 0.2071, |
| "step": 740 |
| }, |
| { |
| "epoch": 3.429228998849252, |
| "grad_norm": 12.061157277304762, |
| "learning_rate": 5.996118809687894e-08, |
| "loss": 0.2537, |
| "step": 745 |
| }, |
| { |
| "epoch": 3.4522439585730726, |
| "grad_norm": 14.129792097128606, |
| "learning_rate": 5.527547059629012e-08, |
| "loss": 0.2384, |
| "step": 750 |
| }, |
| { |
| "epoch": 3.475258918296893, |
| "grad_norm": 12.922540776147939, |
| "learning_rate": 5.0769646040236424e-08, |
| "loss": 0.2387, |
| "step": 755 |
| }, |
| { |
| "epoch": 3.4982738780207137, |
| "grad_norm": 11.979253675702983, |
| "learning_rate": 4.6445537053721396e-08, |
| "loss": 0.2294, |
| "step": 760 |
| }, |
| { |
| "epoch": 3.521288837744534, |
| "grad_norm": 11.457518174839912, |
| "learning_rate": 4.2304892757035636e-08, |
| "loss": 0.2264, |
| "step": 765 |
| }, |
| { |
| "epoch": 3.5443037974683547, |
| "grad_norm": 18.542321221268033, |
| "learning_rate": 3.834938805823029e-08, |
| "loss": 0.2424, |
| "step": 770 |
| }, |
| { |
| "epoch": 3.567318757192175, |
| "grad_norm": 13.26259377110348, |
| "learning_rate": 3.4580622975609375e-08, |
| "loss": 0.2435, |
| "step": 775 |
| }, |
| { |
| "epoch": 3.5903337169159952, |
| "grad_norm": 11.283666064750703, |
| "learning_rate": 3.100012199051627e-08, |
| "loss": 0.2321, |
| "step": 780 |
| }, |
| { |
| "epoch": 3.613348676639816, |
| "grad_norm": 33.16980829715486, |
| "learning_rate": 2.7609333430674232e-08, |
| "loss": 0.2396, |
| "step": 785 |
| }, |
| { |
| "epoch": 3.6363636363636362, |
| "grad_norm": 15.962685581367262, |
| "learning_rate": 2.4409628884331625e-08, |
| "loss": 0.2303, |
| "step": 790 |
| }, |
| { |
| "epoch": 3.659378596087457, |
| "grad_norm": 12.558852196507443, |
| "learning_rate": 2.14023026454489e-08, |
| "loss": 0.2455, |
| "step": 795 |
| }, |
| { |
| "epoch": 3.6823935558112773, |
| "grad_norm": 13.379534877412999, |
| "learning_rate": 1.8588571190151337e-08, |
| "loss": 0.2358, |
| "step": 800 |
| }, |
| { |
| "epoch": 3.705408515535098, |
| "grad_norm": 10.55910332644343, |
| "learning_rate": 1.5969572684658984e-08, |
| "loss": 0.2253, |
| "step": 805 |
| }, |
| { |
| "epoch": 3.7284234752589183, |
| "grad_norm": 14.737759161492995, |
| "learning_rate": 1.3546366524893827e-08, |
| "loss": 0.25, |
| "step": 810 |
| }, |
| { |
| "epoch": 3.751438434982739, |
| "grad_norm": 11.622900035948389, |
| "learning_rate": 1.1319932907949859e-08, |
| "loss": 0.2408, |
| "step": 815 |
| }, |
| { |
| "epoch": 3.7744533947065593, |
| "grad_norm": 12.11809870039906, |
| "learning_rate": 9.291172435598904e-09, |
| "loss": 0.2441, |
| "step": 820 |
| }, |
| { |
| "epoch": 3.7974683544303796, |
| "grad_norm": 18.918951484361397, |
| "learning_rate": 7.460905749993474e-09, |
| "loss": 0.2338, |
| "step": 825 |
| }, |
| { |
| "epoch": 3.8204833141542003, |
| "grad_norm": 14.793042348598215, |
| "learning_rate": 5.8298732017137975e-09, |
| "loss": 0.2405, |
| "step": 830 |
| }, |
| { |
| "epoch": 3.8434982738780206, |
| "grad_norm": 12.864105686917561, |
| "learning_rate": 4.398734550292715e-09, |
| "loss": 0.2329, |
| "step": 835 |
| }, |
| { |
| "epoch": 3.8665132336018413, |
| "grad_norm": 14.835178736224917, |
| "learning_rate": 3.168068697340043e-09, |
| "loss": 0.2528, |
| "step": 840 |
| }, |
| { |
| "epoch": 3.8895281933256616, |
| "grad_norm": 9.895215072689167, |
| "learning_rate": 2.1383734523748308e-09, |
| "loss": 0.2318, |
| "step": 845 |
| }, |
| { |
| "epoch": 3.9125431530494823, |
| "grad_norm": 11.324908576826193, |
| "learning_rate": 1.3100653314587761e-09, |
| "loss": 0.2492, |
| "step": 850 |
| }, |
| { |
| "epoch": 3.9355581127733026, |
| "grad_norm": 10.840567294920715, |
| "learning_rate": 6.834793887142143e-10, |
| "loss": 0.2295, |
| "step": 855 |
| }, |
| { |
| "epoch": 3.9585730724971233, |
| "grad_norm": 9.775718572541846, |
| "learning_rate": 2.5886908079308934e-10, |
| "loss": 0.2134, |
| "step": 860 |
| }, |
| { |
| "epoch": 3.9815880322209436, |
| "grad_norm": 12.104927788108858, |
| "learning_rate": 3.640616435257593e-11, |
| "loss": 0.2589, |
| "step": 865 |
| }, |
| { |
| "epoch": 3.995397008055236, |
| "eval_loss": 0.7718632221221924, |
| "eval_runtime": 19.3388, |
| "eval_samples_per_second": 7.343, |
| "eval_steps_per_second": 0.931, |
| "step": 868 |
| }, |
| { |
| "epoch": 3.995397008055236, |
| "step": 868, |
| "total_flos": 29536675078144.0, |
| "train_loss": 0.387863024725892, |
| "train_runtime": 11116.6413, |
| "train_samples_per_second": 2.501, |
| "train_steps_per_second": 0.078 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 868, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": false, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 29536675078144.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|