| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 27.069486404833835, | |
| "eval_steps": 100, | |
| "global_step": 4200, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.06445115810674723, | |
| "grad_norm": 8.751441955566406, | |
| "learning_rate": 5.154639175257732e-07, | |
| "loss": 8.7908, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.12890231621349446, | |
| "grad_norm": 5.980831146240234, | |
| "learning_rate": 1.0309278350515464e-06, | |
| "loss": 8.4297, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.1933534743202417, | |
| "grad_norm": 3.555551528930664, | |
| "learning_rate": 1.5463917525773197e-06, | |
| "loss": 7.9847, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.2578046324269889, | |
| "grad_norm": 2.1481900215148926, | |
| "learning_rate": 2.061855670103093e-06, | |
| "loss": 7.5452, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.32225579053373615, | |
| "grad_norm": 1.3654990196228027, | |
| "learning_rate": 2.577319587628866e-06, | |
| "loss": 7.1644, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.3867069486404834, | |
| "grad_norm": 1.294124960899353, | |
| "learning_rate": 3.0927835051546395e-06, | |
| "loss": 6.8552, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.4511581067472306, | |
| "grad_norm": 1.3459229469299316, | |
| "learning_rate": 3.6082474226804126e-06, | |
| "loss": 6.5137, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.5156092648539778, | |
| "grad_norm": 1.345466136932373, | |
| "learning_rate": 4.123711340206186e-06, | |
| "loss": 6.2217, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.5800604229607251, | |
| "grad_norm": 1.0641965866088867, | |
| "learning_rate": 4.639175257731959e-06, | |
| "loss": 5.999, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.6445115810674723, | |
| "grad_norm": 1.5514322519302368, | |
| "learning_rate": 5.154639175257732e-06, | |
| "loss": 5.8344, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.6445115810674723, | |
| "eval_loss": 5.791144847869873, | |
| "eval_runtime": 20.2064, | |
| "eval_samples_per_second": 79.43, | |
| "eval_steps_per_second": 4.998, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.7089627391742196, | |
| "grad_norm": 9.654533386230469, | |
| "learning_rate": 5.670103092783505e-06, | |
| "loss": 5.7513, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.7734138972809668, | |
| "grad_norm": 2.767559289932251, | |
| "learning_rate": 6.185567010309279e-06, | |
| "loss": 5.6479, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.837865055387714, | |
| "grad_norm": 1.2786972522735596, | |
| "learning_rate": 6.701030927835052e-06, | |
| "loss": 5.5542, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.9023162134944612, | |
| "grad_norm": 1.3562631607055664, | |
| "learning_rate": 7.216494845360825e-06, | |
| "loss": 5.4406, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.9667673716012085, | |
| "grad_norm": 3.51896333694458, | |
| "learning_rate": 7.731958762886599e-06, | |
| "loss": 5.323, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.0312185297079557, | |
| "grad_norm": 1.808701753616333, | |
| "learning_rate": 8.247422680412371e-06, | |
| "loss": 5.6701, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.095669687814703, | |
| "grad_norm": 3.2831215858459473, | |
| "learning_rate": 8.762886597938146e-06, | |
| "loss": 5.1268, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.1601208459214503, | |
| "grad_norm": 2.606133460998535, | |
| "learning_rate": 9.278350515463918e-06, | |
| "loss": 5.0758, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.2245720040281973, | |
| "grad_norm": 1.7396525144577026, | |
| "learning_rate": 9.793814432989691e-06, | |
| "loss": 5.0134, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.2890231621349446, | |
| "grad_norm": 0.7748392820358276, | |
| "learning_rate": 1.0309278350515464e-05, | |
| "loss": 4.9703, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.2890231621349446, | |
| "eval_loss": 4.946072578430176, | |
| "eval_runtime": 19.5278, | |
| "eval_samples_per_second": 82.19, | |
| "eval_steps_per_second": 5.172, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.353474320241692, | |
| "grad_norm": 2.377145290374756, | |
| "learning_rate": 1.0824742268041238e-05, | |
| "loss": 4.874, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.417925478348439, | |
| "grad_norm": 1.2470804452896118, | |
| "learning_rate": 1.134020618556701e-05, | |
| "loss": 4.8473, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.4823766364551862, | |
| "grad_norm": 2.26425838470459, | |
| "learning_rate": 1.1855670103092785e-05, | |
| "loss": 4.7739, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.5468277945619335, | |
| "grad_norm": 1.238208532333374, | |
| "learning_rate": 1.2371134020618558e-05, | |
| "loss": 4.7524, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.6112789526686808, | |
| "grad_norm": 0.9819815158843994, | |
| "learning_rate": 1.2886597938144332e-05, | |
| "loss": 4.6977, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.675730110775428, | |
| "grad_norm": 1.37124502658844, | |
| "learning_rate": 1.3402061855670103e-05, | |
| "loss": 4.5812, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.7401812688821754, | |
| "grad_norm": 1.0728330612182617, | |
| "learning_rate": 1.3917525773195878e-05, | |
| "loss": 4.5445, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.8046324269889225, | |
| "grad_norm": 1.4190095663070679, | |
| "learning_rate": 1.443298969072165e-05, | |
| "loss": 4.4797, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.8690835850956697, | |
| "grad_norm": 1.3823643922805786, | |
| "learning_rate": 1.4948453608247425e-05, | |
| "loss": 4.3833, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.9335347432024168, | |
| "grad_norm": 1.6940747499465942, | |
| "learning_rate": 1.5463917525773197e-05, | |
| "loss": 4.2822, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.9335347432024168, | |
| "eval_loss": 4.213192462921143, | |
| "eval_runtime": 19.7585, | |
| "eval_samples_per_second": 81.231, | |
| "eval_steps_per_second": 5.112, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.997985901309164, | |
| "grad_norm": 1.6004289388656616, | |
| "learning_rate": 1.597938144329897e-05, | |
| "loss": 4.1608, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.0624370594159114, | |
| "grad_norm": 2.5717203617095947, | |
| "learning_rate": 1.6494845360824743e-05, | |
| "loss": 4.392, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.1268882175226587, | |
| "grad_norm": 1.9171743392944336, | |
| "learning_rate": 1.7010309278350517e-05, | |
| "loss": 3.9273, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.191339375629406, | |
| "grad_norm": 2.1511971950531006, | |
| "learning_rate": 1.752577319587629e-05, | |
| "loss": 3.8527, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.2557905337361532, | |
| "grad_norm": 1.949204683303833, | |
| "learning_rate": 1.8041237113402062e-05, | |
| "loss": 3.7843, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.3202416918429005, | |
| "grad_norm": 1.6854971647262573, | |
| "learning_rate": 1.8556701030927837e-05, | |
| "loss": 3.7476, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.3846928499496474, | |
| "grad_norm": 1.9099489450454712, | |
| "learning_rate": 1.907216494845361e-05, | |
| "loss": 3.6868, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 2.4491440080563947, | |
| "grad_norm": 1.9068998098373413, | |
| "learning_rate": 1.9587628865979382e-05, | |
| "loss": 3.6684, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 2.513595166163142, | |
| "grad_norm": 1.3064167499542236, | |
| "learning_rate": 1.9999996358015542e-05, | |
| "loss": 3.6028, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 2.5780463242698892, | |
| "grad_norm": 1.7267900705337524, | |
| "learning_rate": 1.9999868888837957e-05, | |
| "loss": 3.5599, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.5780463242698892, | |
| "eval_loss": 3.5509181022644043, | |
| "eval_runtime": 20.1957, | |
| "eval_samples_per_second": 79.472, | |
| "eval_steps_per_second": 5.001, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.6424974823766365, | |
| "grad_norm": 2.2463932037353516, | |
| "learning_rate": 1.9999559323090132e-05, | |
| "loss": 3.5059, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 2.706948640483384, | |
| "grad_norm": 2.046381950378418, | |
| "learning_rate": 1.9999067666409225e-05, | |
| "loss": 3.4585, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 2.771399798590131, | |
| "grad_norm": 1.8293402194976807, | |
| "learning_rate": 1.9998393927748257e-05, | |
| "loss": 3.4393, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 2.835850956696878, | |
| "grad_norm": 1.701651930809021, | |
| "learning_rate": 1.9997538119375938e-05, | |
| "loss": 3.3986, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.900302114803625, | |
| "grad_norm": 1.8549060821533203, | |
| "learning_rate": 1.9996500256876447e-05, | |
| "loss": 3.3568, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.9647532729103725, | |
| "grad_norm": 2.38415789604187, | |
| "learning_rate": 1.999528035914915e-05, | |
| "loss": 3.3115, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 3.02920443101712, | |
| "grad_norm": 3.2878332138061523, | |
| "learning_rate": 1.9993878448408263e-05, | |
| "loss": 3.5941, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 3.093655589123867, | |
| "grad_norm": 2.5062038898468018, | |
| "learning_rate": 1.999229455018243e-05, | |
| "loss": 3.2588, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 3.1581067472306144, | |
| "grad_norm": 2.2001876831054688, | |
| "learning_rate": 1.9990528693314273e-05, | |
| "loss": 3.2381, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 3.2225579053373616, | |
| "grad_norm": 2.3333001136779785, | |
| "learning_rate": 1.9988580909959864e-05, | |
| "loss": 3.2271, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 3.2225579053373616, | |
| "eval_loss": 3.236539840698242, | |
| "eval_runtime": 19.5719, | |
| "eval_samples_per_second": 82.005, | |
| "eval_steps_per_second": 5.16, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 3.287009063444109, | |
| "grad_norm": 2.109981060028076, | |
| "learning_rate": 1.9986451235588135e-05, | |
| "loss": 3.2038, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 3.351460221550856, | |
| "grad_norm": 2.376877546310425, | |
| "learning_rate": 1.9984139708980228e-05, | |
| "loss": 3.1678, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 3.415911379657603, | |
| "grad_norm": 2.0894505977630615, | |
| "learning_rate": 1.9981646372228813e-05, | |
| "loss": 3.1701, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 3.4803625377643503, | |
| "grad_norm": 1.9505690336227417, | |
| "learning_rate": 1.997897127073728e-05, | |
| "loss": 3.1902, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 3.5448136958710976, | |
| "grad_norm": 3.200566530227661, | |
| "learning_rate": 1.997611445321896e-05, | |
| "loss": 3.1286, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 3.609264853977845, | |
| "grad_norm": 2.295381784439087, | |
| "learning_rate": 1.9973075971696195e-05, | |
| "loss": 3.1308, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 3.673716012084592, | |
| "grad_norm": 2.323788642883301, | |
| "learning_rate": 1.9969855881499413e-05, | |
| "loss": 3.1093, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 3.7381671701913395, | |
| "grad_norm": 1.9054023027420044, | |
| "learning_rate": 1.996645424126613e-05, | |
| "loss": 3.1029, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 3.8026183282980868, | |
| "grad_norm": 2.854268789291382, | |
| "learning_rate": 1.996287111293986e-05, | |
| "loss": 3.0843, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 3.8670694864048336, | |
| "grad_norm": 2.066882848739624, | |
| "learning_rate": 1.9959106561768988e-05, | |
| "loss": 3.0301, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 3.8670694864048336, | |
| "eval_loss": 3.056886672973633, | |
| "eval_runtime": 20.2056, | |
| "eval_samples_per_second": 79.433, | |
| "eval_steps_per_second": 4.999, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 3.931520644511581, | |
| "grad_norm": 2.220766544342041, | |
| "learning_rate": 1.9955160656305606e-05, | |
| "loss": 3.02, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 3.995971802618328, | |
| "grad_norm": 2.461122751235962, | |
| "learning_rate": 1.995103346840424e-05, | |
| "loss": 3.0121, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 4.0604229607250755, | |
| "grad_norm": 1.9937182664871216, | |
| "learning_rate": 1.9946725073220542e-05, | |
| "loss": 3.249, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 4.124874118831823, | |
| "grad_norm": 2.3354651927948, | |
| "learning_rate": 1.9942235549209955e-05, | |
| "loss": 2.9879, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 4.18932527693857, | |
| "grad_norm": 2.059208393096924, | |
| "learning_rate": 1.9937564978126233e-05, | |
| "loss": 2.987, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 4.253776435045317, | |
| "grad_norm": 2.804398775100708, | |
| "learning_rate": 1.9932713445019993e-05, | |
| "loss": 2.9377, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 4.318227593152065, | |
| "grad_norm": 2.1567623615264893, | |
| "learning_rate": 1.992768103823714e-05, | |
| "loss": 2.9478, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 4.382678751258812, | |
| "grad_norm": 2.021939992904663, | |
| "learning_rate": 1.9922467849417288e-05, | |
| "loss": 2.9119, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 4.447129909365559, | |
| "grad_norm": 1.5279889106750488, | |
| "learning_rate": 1.9917073973492055e-05, | |
| "loss": 2.9033, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 4.5115810674723065, | |
| "grad_norm": 1.7887712717056274, | |
| "learning_rate": 1.991149950868336e-05, | |
| "loss": 2.8944, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 4.5115810674723065, | |
| "eval_loss": 2.8965914249420166, | |
| "eval_runtime": 21.0307, | |
| "eval_samples_per_second": 76.317, | |
| "eval_steps_per_second": 4.802, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 4.576032225579054, | |
| "grad_norm": 1.7073420286178589, | |
| "learning_rate": 1.9905744556501627e-05, | |
| "loss": 2.8471, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 4.640483383685801, | |
| "grad_norm": 1.862641453742981, | |
| "learning_rate": 1.989980922174394e-05, | |
| "loss": 2.8432, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 4.704934541792548, | |
| "grad_norm": 1.9594634771347046, | |
| "learning_rate": 1.9893693612492116e-05, | |
| "loss": 2.8482, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 4.769385699899295, | |
| "grad_norm": 1.6772149801254272, | |
| "learning_rate": 1.988739784011077e-05, | |
| "loss": 2.8538, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 4.833836858006042, | |
| "grad_norm": 1.9590495824813843, | |
| "learning_rate": 1.9880922019245258e-05, | |
| "loss": 2.8703, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 4.898288016112789, | |
| "grad_norm": 1.4671125411987305, | |
| "learning_rate": 1.9874266267819604e-05, | |
| "loss": 2.8078, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 4.962739174219537, | |
| "grad_norm": 1.8790684938430786, | |
| "learning_rate": 1.986743070703435e-05, | |
| "loss": 2.779, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 5.027190332326284, | |
| "grad_norm": 1.6177160739898682, | |
| "learning_rate": 1.9860415461364343e-05, | |
| "loss": 3.0088, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 5.091641490433031, | |
| "grad_norm": 1.774277687072754, | |
| "learning_rate": 1.9853220658556474e-05, | |
| "loss": 2.7841, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 5.1560926485397784, | |
| "grad_norm": 1.4572412967681885, | |
| "learning_rate": 1.984584642962735e-05, | |
| "loss": 2.7748, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 5.1560926485397784, | |
| "eval_loss": 2.7890875339508057, | |
| "eval_runtime": 19.598, | |
| "eval_samples_per_second": 81.896, | |
| "eval_steps_per_second": 5.154, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 5.220543806646526, | |
| "grad_norm": 1.4425498247146606, | |
| "learning_rate": 1.9838292908860922e-05, | |
| "loss": 2.7712, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 5.284994964753273, | |
| "grad_norm": 1.4506257772445679, | |
| "learning_rate": 1.9830560233806006e-05, | |
| "loss": 2.758, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 5.34944612286002, | |
| "grad_norm": 1.3948203325271606, | |
| "learning_rate": 1.982264854527381e-05, | |
| "loss": 2.7487, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 5.413897280966768, | |
| "grad_norm": 1.170486330986023, | |
| "learning_rate": 1.9814557987335363e-05, | |
| "loss": 2.7558, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 5.478348439073515, | |
| "grad_norm": 1.3575382232666016, | |
| "learning_rate": 1.980628870731888e-05, | |
| "loss": 2.7266, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 5.542799597180262, | |
| "grad_norm": 1.3390381336212158, | |
| "learning_rate": 1.979784085580708e-05, | |
| "loss": 2.7156, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 5.6072507552870094, | |
| "grad_norm": 1.3795325756072998, | |
| "learning_rate": 1.978921458663447e-05, | |
| "loss": 2.7069, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 5.671701913393756, | |
| "grad_norm": 1.2567253112792969, | |
| "learning_rate": 1.9780410056884505e-05, | |
| "loss": 2.7121, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 5.736153071500503, | |
| "grad_norm": 1.6211382150650024, | |
| "learning_rate": 1.977142742688676e-05, | |
| "loss": 2.7006, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 5.80060422960725, | |
| "grad_norm": 1.5447043180465698, | |
| "learning_rate": 1.9762266860213982e-05, | |
| "loss": 2.6635, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 5.80060422960725, | |
| "eval_loss": 2.704418659210205, | |
| "eval_runtime": 20.2115, | |
| "eval_samples_per_second": 79.41, | |
| "eval_steps_per_second": 4.997, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 5.865055387713998, | |
| "grad_norm": 1.28032648563385, | |
| "learning_rate": 1.9752928523679145e-05, | |
| "loss": 2.7062, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 5.929506545820745, | |
| "grad_norm": 1.323886752128601, | |
| "learning_rate": 1.974341258733238e-05, | |
| "loss": 2.6955, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 5.993957703927492, | |
| "grad_norm": 1.370737910270691, | |
| "learning_rate": 1.9733719224457896e-05, | |
| "loss": 2.6607, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 6.05840886203424, | |
| "grad_norm": 1.2304973602294922, | |
| "learning_rate": 1.972384861157082e-05, | |
| "loss": 2.8645, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 6.122860020140987, | |
| "grad_norm": 1.251336693763733, | |
| "learning_rate": 1.9713800928413987e-05, | |
| "loss": 2.678, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 6.187311178247734, | |
| "grad_norm": 1.4331055879592896, | |
| "learning_rate": 1.9703576357954653e-05, | |
| "loss": 2.6339, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 6.251762336354481, | |
| "grad_norm": 1.4404268264770508, | |
| "learning_rate": 1.969317508638119e-05, | |
| "loss": 2.6497, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 6.316213494461229, | |
| "grad_norm": 1.2648017406463623, | |
| "learning_rate": 1.9682597303099663e-05, | |
| "loss": 2.6194, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 6.380664652567976, | |
| "grad_norm": 1.464142084121704, | |
| "learning_rate": 1.9671843200730408e-05, | |
| "loss": 2.6342, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 6.445115810674723, | |
| "grad_norm": 1.4572694301605225, | |
| "learning_rate": 1.96609129751045e-05, | |
| "loss": 2.6102, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 6.445115810674723, | |
| "eval_loss": 2.6344494819641113, | |
| "eval_runtime": 19.7801, | |
| "eval_samples_per_second": 81.142, | |
| "eval_steps_per_second": 5.106, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 6.509566968781471, | |
| "grad_norm": 1.3998351097106934, | |
| "learning_rate": 1.9649806825260215e-05, | |
| "loss": 2.6117, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 6.574018126888218, | |
| "grad_norm": 1.2323154211044312, | |
| "learning_rate": 1.9638524953439385e-05, | |
| "loss": 2.5907, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 6.638469284994965, | |
| "grad_norm": 1.5451122522354126, | |
| "learning_rate": 1.9627067565083716e-05, | |
| "loss": 2.5988, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 6.702920443101712, | |
| "grad_norm": 1.4288865327835083, | |
| "learning_rate": 1.9615434868831057e-05, | |
| "loss": 2.6088, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 6.76737160120846, | |
| "grad_norm": 1.4163669347763062, | |
| "learning_rate": 1.9603627076511595e-05, | |
| "loss": 2.5787, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 6.831822759315206, | |
| "grad_norm": 1.4155839681625366, | |
| "learning_rate": 1.9591644403143997e-05, | |
| "loss": 2.5802, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 6.896273917421953, | |
| "grad_norm": 1.2025197744369507, | |
| "learning_rate": 1.9579487066931495e-05, | |
| "loss": 2.56, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 6.960725075528701, | |
| "grad_norm": 1.331957459449768, | |
| "learning_rate": 1.956715528925792e-05, | |
| "loss": 2.5797, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 7.025176233635448, | |
| "grad_norm": 1.543091893196106, | |
| "learning_rate": 1.955464929468365e-05, | |
| "loss": 2.7584, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 7.089627391742195, | |
| "grad_norm": 1.4929845333099365, | |
| "learning_rate": 1.954196931094155e-05, | |
| "loss": 2.5724, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 7.089627391742195, | |
| "eval_loss": 2.5758652687072754, | |
| "eval_runtime": 19.7829, | |
| "eval_samples_per_second": 81.131, | |
| "eval_steps_per_second": 5.105, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 7.1540785498489425, | |
| "grad_norm": 1.3239967823028564, | |
| "learning_rate": 1.9529115568932796e-05, | |
| "loss": 2.5215, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 7.21852970795569, | |
| "grad_norm": 1.2997671365737915, | |
| "learning_rate": 1.9516088302722696e-05, | |
| "loss": 2.5256, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 7.282980866062437, | |
| "grad_norm": 1.4063647985458374, | |
| "learning_rate": 1.9502887749536406e-05, | |
| "loss": 2.5286, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 7.347432024169184, | |
| "grad_norm": 1.625854730606079, | |
| "learning_rate": 1.9489514149754624e-05, | |
| "loss": 2.5245, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 7.411883182275932, | |
| "grad_norm": 1.2255531549453735, | |
| "learning_rate": 1.9475967746909212e-05, | |
| "loss": 2.5373, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 7.476334340382679, | |
| "grad_norm": 1.3738540410995483, | |
| "learning_rate": 1.946224878767875e-05, | |
| "loss": 2.5108, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 7.540785498489426, | |
| "grad_norm": 1.7884373664855957, | |
| "learning_rate": 1.9448357521884057e-05, | |
| "loss": 2.512, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 7.6052366565961735, | |
| "grad_norm": 1.4379594326019287, | |
| "learning_rate": 1.9434294202483634e-05, | |
| "loss": 2.5219, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 7.669687814702921, | |
| "grad_norm": 1.5573759078979492, | |
| "learning_rate": 1.9420059085569062e-05, | |
| "loss": 2.4937, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 7.734138972809667, | |
| "grad_norm": 1.5050828456878662, | |
| "learning_rate": 1.940565243036034e-05, | |
| "loss": 2.5267, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 7.734138972809667, | |
| "eval_loss": 2.5181825160980225, | |
| "eval_runtime": 19.5726, | |
| "eval_samples_per_second": 82.002, | |
| "eval_steps_per_second": 5.16, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 7.7985901309164145, | |
| "grad_norm": 1.6680591106414795, | |
| "learning_rate": 1.9391074499201155e-05, | |
| "loss": 2.5034, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 7.863041289023162, | |
| "grad_norm": 1.1828731298446655, | |
| "learning_rate": 1.9376325557554113e-05, | |
| "loss": 2.493, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 7.927492447129909, | |
| "grad_norm": 1.5516493320465088, | |
| "learning_rate": 1.9361405873995904e-05, | |
| "loss": 2.4866, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 7.991943605236656, | |
| "grad_norm": 1.2538108825683594, | |
| "learning_rate": 1.9346315720212416e-05, | |
| "loss": 2.4935, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 8.056394763343404, | |
| "grad_norm": 1.4746509790420532, | |
| "learning_rate": 1.933105537099377e-05, | |
| "loss": 2.6784, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 8.120845921450151, | |
| "grad_norm": 1.491493821144104, | |
| "learning_rate": 1.9315625104229336e-05, | |
| "loss": 2.484, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 8.185297079556898, | |
| "grad_norm": 1.5967274904251099, | |
| "learning_rate": 1.9300025200902666e-05, | |
| "loss": 2.4746, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 8.249748237663646, | |
| "grad_norm": 1.477860689163208, | |
| "learning_rate": 1.928425594508637e-05, | |
| "loss": 2.4447, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 8.314199395770393, | |
| "grad_norm": 1.4457802772521973, | |
| "learning_rate": 1.9268317623936957e-05, | |
| "loss": 2.4438, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 8.37865055387714, | |
| "grad_norm": 1.1860662698745728, | |
| "learning_rate": 1.9252210527689596e-05, | |
| "loss": 2.4863, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 8.37865055387714, | |
| "eval_loss": 2.4702823162078857, | |
| "eval_runtime": 20.2158, | |
| "eval_samples_per_second": 79.393, | |
| "eval_steps_per_second": 4.996, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 8.443101711983887, | |
| "grad_norm": 1.36308753490448, | |
| "learning_rate": 1.9235934949652825e-05, | |
| "loss": 2.4382, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 8.507552870090635, | |
| "grad_norm": 1.0049257278442383, | |
| "learning_rate": 1.9219491186203222e-05, | |
| "loss": 2.4478, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 8.572004028197382, | |
| "grad_norm": 1.4377230405807495, | |
| "learning_rate": 1.9202879536780013e-05, | |
| "loss": 2.4276, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 8.63645518630413, | |
| "grad_norm": 1.3836121559143066, | |
| "learning_rate": 1.91861003038796e-05, | |
| "loss": 2.4464, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 8.700906344410877, | |
| "grad_norm": 1.3200100660324097, | |
| "learning_rate": 1.9169153793050065e-05, | |
| "loss": 2.438, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 8.765357502517624, | |
| "grad_norm": 1.4599627256393433, | |
| "learning_rate": 1.9152040312885604e-05, | |
| "loss": 2.4192, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 8.829808660624371, | |
| "grad_norm": 1.2848976850509644, | |
| "learning_rate": 1.9134760175020906e-05, | |
| "loss": 2.4119, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 8.894259818731118, | |
| "grad_norm": 1.195146083831787, | |
| "learning_rate": 1.9117313694125482e-05, | |
| "loss": 2.4047, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 8.958710976837866, | |
| "grad_norm": 1.5599223375320435, | |
| "learning_rate": 1.9099701187897927e-05, | |
| "loss": 2.4238, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 9.023162134944613, | |
| "grad_norm": 1.7016339302062988, | |
| "learning_rate": 1.9081922977060146e-05, | |
| "loss": 2.5924, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 9.023162134944613, | |
| "eval_loss": 2.425938844680786, | |
| "eval_runtime": 20.1925, | |
| "eval_samples_per_second": 79.485, | |
| "eval_steps_per_second": 5.002, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 9.08761329305136, | |
| "grad_norm": 1.5949656963348389, | |
| "learning_rate": 1.9063979385351512e-05, | |
| "loss": 2.4341, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 9.152064451158108, | |
| "grad_norm": 1.4340884685516357, | |
| "learning_rate": 1.9045870739522953e-05, | |
| "loss": 2.4071, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 9.216515609264855, | |
| "grad_norm": 1.2564432621002197, | |
| "learning_rate": 1.902759736933102e-05, | |
| "loss": 2.3866, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 9.280966767371602, | |
| "grad_norm": 1.338267207145691, | |
| "learning_rate": 1.9009159607531886e-05, | |
| "loss": 2.3687, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 9.345417925478348, | |
| "grad_norm": 1.4795355796813965, | |
| "learning_rate": 1.8990557789875265e-05, | |
| "loss": 2.3932, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 9.409869083585095, | |
| "grad_norm": 1.3495395183563232, | |
| "learning_rate": 1.8971792255098326e-05, | |
| "loss": 2.3738, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 9.474320241691842, | |
| "grad_norm": 1.5323916673660278, | |
| "learning_rate": 1.8952863344919495e-05, | |
| "loss": 2.3773, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 9.53877139979859, | |
| "grad_norm": 1.2470097541809082, | |
| "learning_rate": 1.893377140403225e-05, | |
| "loss": 2.364, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 9.603222557905337, | |
| "grad_norm": 1.2765355110168457, | |
| "learning_rate": 1.891451678009886e-05, | |
| "loss": 2.3414, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 9.667673716012084, | |
| "grad_norm": 1.2045294046401978, | |
| "learning_rate": 1.8895099823744005e-05, | |
| "loss": 2.3869, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 9.667673716012084, | |
| "eval_loss": 2.387402057647705, | |
| "eval_runtime": 20.2004, | |
| "eval_samples_per_second": 79.454, | |
| "eval_steps_per_second": 5.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 9.732124874118831, | |
| "grad_norm": 1.488773226737976, | |
| "learning_rate": 1.887552088854844e-05, | |
| "loss": 2.3653, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 9.796576032225579, | |
| "grad_norm": 1.442879319190979, | |
| "learning_rate": 1.8855780331042538e-05, | |
| "loss": 2.3483, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 9.861027190332326, | |
| "grad_norm": 1.2660679817199707, | |
| "learning_rate": 1.8835878510699793e-05, | |
| "loss": 2.3569, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 9.925478348439073, | |
| "grad_norm": 1.575183391571045, | |
| "learning_rate": 1.8815815789930277e-05, | |
| "loss": 2.3385, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 9.98992950654582, | |
| "grad_norm": 1.270186185836792, | |
| "learning_rate": 1.8795592534074045e-05, | |
| "loss": 2.3276, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 10.054380664652568, | |
| "grad_norm": 1.4595534801483154, | |
| "learning_rate": 1.877520911139448e-05, | |
| "loss": 2.5222, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 10.118831822759315, | |
| "grad_norm": 1.2067569494247437, | |
| "learning_rate": 1.8754665893071583e-05, | |
| "loss": 2.3175, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 10.183282980866062, | |
| "grad_norm": 1.1168224811553955, | |
| "learning_rate": 1.8733963253195217e-05, | |
| "loss": 2.3505, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 10.24773413897281, | |
| "grad_norm": 1.3407734632492065, | |
| "learning_rate": 1.8713101568758295e-05, | |
| "loss": 2.3406, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 10.312185297079557, | |
| "grad_norm": 1.2485662698745728, | |
| "learning_rate": 1.8692081219649926e-05, | |
| "loss": 2.3084, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 10.312185297079557, | |
| "eval_loss": 2.3460793495178223, | |
| "eval_runtime": 19.8125, | |
| "eval_samples_per_second": 81.009, | |
| "eval_steps_per_second": 5.098, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 10.376636455186304, | |
| "grad_norm": 1.335404872894287, | |
| "learning_rate": 1.8670902588648467e-05, | |
| "loss": 2.3151, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 10.441087613293051, | |
| "grad_norm": 1.5283225774765015, | |
| "learning_rate": 1.8649566061414583e-05, | |
| "loss": 2.3153, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 10.505538771399799, | |
| "grad_norm": 1.2574470043182373, | |
| "learning_rate": 1.8628072026484215e-05, | |
| "loss": 2.303, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 10.569989929506546, | |
| "grad_norm": 1.3589491844177246, | |
| "learning_rate": 1.8606420875261492e-05, | |
| "loss": 2.3191, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 10.634441087613293, | |
| "grad_norm": 1.3146897554397583, | |
| "learning_rate": 1.858461300201163e-05, | |
| "loss": 2.3128, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 10.69889224572004, | |
| "grad_norm": 1.363336205482483, | |
| "learning_rate": 1.856264880385372e-05, | |
| "loss": 2.2795, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 10.763343403826788, | |
| "grad_norm": 1.402933120727539, | |
| "learning_rate": 1.8540528680753525e-05, | |
| "loss": 2.2819, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 10.827794561933535, | |
| "grad_norm": 1.199621558189392, | |
| "learning_rate": 1.851825303551618e-05, | |
| "loss": 2.2937, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 10.892245720040282, | |
| "grad_norm": 1.4294910430908203, | |
| "learning_rate": 1.8495822273778867e-05, | |
| "loss": 2.2985, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 10.95669687814703, | |
| "grad_norm": 1.211239218711853, | |
| "learning_rate": 1.8473236804003412e-05, | |
| "loss": 2.2803, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 10.95669687814703, | |
| "eval_loss": 2.2990825176239014, | |
| "eval_runtime": 19.7801, | |
| "eval_samples_per_second": 81.142, | |
| "eval_steps_per_second": 5.106, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 11.021148036253777, | |
| "grad_norm": 1.610541582107544, | |
| "learning_rate": 1.8450497037468876e-05, | |
| "loss": 2.4507, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 11.085599194360524, | |
| "grad_norm": 1.265767216682434, | |
| "learning_rate": 1.8427603388264027e-05, | |
| "loss": 2.2681, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 11.150050352467272, | |
| "grad_norm": 1.1170353889465332, | |
| "learning_rate": 1.8404556273279835e-05, | |
| "loss": 2.2513, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 11.214501510574019, | |
| "grad_norm": 1.5260467529296875, | |
| "learning_rate": 1.8381356112201863e-05, | |
| "loss": 2.2696, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 11.278952668680766, | |
| "grad_norm": 1.3105249404907227, | |
| "learning_rate": 1.835800332750263e-05, | |
| "loss": 2.2703, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 11.343403826787512, | |
| "grad_norm": 1.692631483078003, | |
| "learning_rate": 1.8334498344433903e-05, | |
| "loss": 2.2637, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 11.407854984894259, | |
| "grad_norm": 1.5331857204437256, | |
| "learning_rate": 1.8310841591018977e-05, | |
| "loss": 2.2499, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 11.472306143001006, | |
| "grad_norm": 1.3038716316223145, | |
| "learning_rate": 1.828703349804487e-05, | |
| "loss": 2.2658, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 11.536757301107754, | |
| "grad_norm": 1.1918790340423584, | |
| "learning_rate": 1.826307449905447e-05, | |
| "loss": 2.2517, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 11.6012084592145, | |
| "grad_norm": 1.4929193258285522, | |
| "learning_rate": 1.823896503033865e-05, | |
| "loss": 2.2393, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 11.6012084592145, | |
| "eval_loss": 2.256037712097168, | |
| "eval_runtime": 21.0369, | |
| "eval_samples_per_second": 76.294, | |
| "eval_steps_per_second": 4.801, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 11.665659617321248, | |
| "grad_norm": 1.3841142654418945, | |
| "learning_rate": 1.8214705530928322e-05, | |
| "loss": 2.228, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 11.730110775427995, | |
| "grad_norm": 1.3609843254089355, | |
| "learning_rate": 1.819029644258645e-05, | |
| "loss": 2.2211, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 11.794561933534743, | |
| "grad_norm": 1.7460395097732544, | |
| "learning_rate": 1.816573820979998e-05, | |
| "loss": 2.2081, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 11.85901309164149, | |
| "grad_norm": 1.2562885284423828, | |
| "learning_rate": 1.8141031279771777e-05, | |
| "loss": 2.2201, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 11.923464249748237, | |
| "grad_norm": 1.5480189323425293, | |
| "learning_rate": 1.811617610241246e-05, | |
| "loss": 2.2099, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 11.987915407854985, | |
| "grad_norm": 1.1817470788955688, | |
| "learning_rate": 1.8091173130332214e-05, | |
| "loss": 2.1935, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 12.052366565961732, | |
| "grad_norm": 1.283387541770935, | |
| "learning_rate": 1.8066022818832564e-05, | |
| "loss": 2.3754, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 12.11681772406848, | |
| "grad_norm": 1.4642541408538818, | |
| "learning_rate": 1.804072562589805e-05, | |
| "loss": 2.1914, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 12.181268882175226, | |
| "grad_norm": 1.3333503007888794, | |
| "learning_rate": 1.8015282012187927e-05, | |
| "loss": 2.1918, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 12.245720040281974, | |
| "grad_norm": 1.4526790380477905, | |
| "learning_rate": 1.7989692441027744e-05, | |
| "loss": 2.1748, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 12.245720040281974, | |
| "eval_loss": 2.212247133255005, | |
| "eval_runtime": 20.2138, | |
| "eval_samples_per_second": 79.401, | |
| "eval_steps_per_second": 4.997, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 12.310171198388721, | |
| "grad_norm": 1.2756415605545044, | |
| "learning_rate": 1.796395737840093e-05, | |
| "loss": 2.2262, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 12.374622356495468, | |
| "grad_norm": 1.2323824167251587, | |
| "learning_rate": 1.7938077292940288e-05, | |
| "loss": 2.1796, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 12.439073514602216, | |
| "grad_norm": 1.2132095098495483, | |
| "learning_rate": 1.7912052655919478e-05, | |
| "loss": 2.1934, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 12.503524672708963, | |
| "grad_norm": 1.45913827419281, | |
| "learning_rate": 1.7885883941244432e-05, | |
| "loss": 2.169, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 12.56797583081571, | |
| "grad_norm": 1.398886799812317, | |
| "learning_rate": 1.7859571625444712e-05, | |
| "loss": 2.1845, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 12.632426988922457, | |
| "grad_norm": 1.386767029762268, | |
| "learning_rate": 1.7833116187664846e-05, | |
| "loss": 2.1563, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 12.696878147029205, | |
| "grad_norm": 1.4831831455230713, | |
| "learning_rate": 1.7806518109655604e-05, | |
| "loss": 2.1592, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 12.761329305135952, | |
| "grad_norm": 1.3067753314971924, | |
| "learning_rate": 1.777977787576521e-05, | |
| "loss": 2.1587, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 12.8257804632427, | |
| "grad_norm": 1.4871938228607178, | |
| "learning_rate": 1.7752895972930538e-05, | |
| "loss": 2.1575, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 12.890231621349447, | |
| "grad_norm": 1.3559268712997437, | |
| "learning_rate": 1.772587289066823e-05, | |
| "loss": 2.1365, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 12.890231621349447, | |
| "eval_loss": 2.167628526687622, | |
| "eval_runtime": 19.7767, | |
| "eval_samples_per_second": 81.156, | |
| "eval_steps_per_second": 5.107, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 12.954682779456194, | |
| "grad_norm": 1.4033281803131104, | |
| "learning_rate": 1.769870912106581e-05, | |
| "loss": 2.1484, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 13.019133937562941, | |
| "grad_norm": 1.3120031356811523, | |
| "learning_rate": 1.7671405158772686e-05, | |
| "loss": 2.3176, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 13.083585095669688, | |
| "grad_norm": 1.3632259368896484, | |
| "learning_rate": 1.764396150099116e-05, | |
| "loss": 2.1399, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 13.148036253776436, | |
| "grad_norm": 1.3641666173934937, | |
| "learning_rate": 1.7616378647467387e-05, | |
| "loss": 2.1302, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 13.212487411883183, | |
| "grad_norm": 1.2037588357925415, | |
| "learning_rate": 1.758865710048225e-05, | |
| "loss": 2.1169, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 13.27693856998993, | |
| "grad_norm": 1.3655686378479004, | |
| "learning_rate": 1.7560797364842235e-05, | |
| "loss": 2.1228, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 13.341389728096678, | |
| "grad_norm": 1.2204645872116089, | |
| "learning_rate": 1.7532799947870224e-05, | |
| "loss": 2.1027, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 13.405840886203425, | |
| "grad_norm": 1.3521157503128052, | |
| "learning_rate": 1.7504665359396255e-05, | |
| "loss": 2.1038, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 13.47029204431017, | |
| "grad_norm": 1.3395577669143677, | |
| "learning_rate": 1.7476394111748262e-05, | |
| "loss": 2.0887, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 13.534743202416918, | |
| "grad_norm": 1.427236795425415, | |
| "learning_rate": 1.7447986719742708e-05, | |
| "loss": 2.1054, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 13.534743202416918, | |
| "eval_loss": 2.1201364994049072, | |
| "eval_runtime": 19.5957, | |
| "eval_samples_per_second": 81.906, | |
| "eval_steps_per_second": 5.154, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 13.599194360523665, | |
| "grad_norm": 1.7869354486465454, | |
| "learning_rate": 1.7419443700675248e-05, | |
| "loss": 2.0844, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 13.663645518630412, | |
| "grad_norm": 1.4408442974090576, | |
| "learning_rate": 1.7390765574311287e-05, | |
| "loss": 2.1039, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 13.72809667673716, | |
| "grad_norm": 1.5654326677322388, | |
| "learning_rate": 1.7361952862876505e-05, | |
| "loss": 2.0889, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 13.792547834843907, | |
| "grad_norm": 1.5477581024169922, | |
| "learning_rate": 1.7333006091047386e-05, | |
| "loss": 2.0645, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 13.856998992950654, | |
| "grad_norm": 1.3517593145370483, | |
| "learning_rate": 1.730392578594162e-05, | |
| "loss": 2.0899, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 13.921450151057401, | |
| "grad_norm": 1.4705018997192383, | |
| "learning_rate": 1.7274712477108538e-05, | |
| "loss": 2.0876, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 13.985901309164149, | |
| "grad_norm": 1.5158582925796509, | |
| "learning_rate": 1.7245366696519448e-05, | |
| "loss": 2.0757, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 14.050352467270896, | |
| "grad_norm": 1.5032602548599243, | |
| "learning_rate": 1.7215888978557953e-05, | |
| "loss": 2.2535, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 14.114803625377643, | |
| "grad_norm": 1.3072115182876587, | |
| "learning_rate": 1.7186279860010228e-05, | |
| "loss": 2.0597, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 14.17925478348439, | |
| "grad_norm": 1.522055745124817, | |
| "learning_rate": 1.7156539880055236e-05, | |
| "loss": 2.0326, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 14.17925478348439, | |
| "eval_loss": 2.0740458965301514, | |
| "eval_runtime": 19.8046, | |
| "eval_samples_per_second": 81.042, | |
| "eval_steps_per_second": 5.1, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 14.243705941591138, | |
| "grad_norm": 1.2756038904190063, | |
| "learning_rate": 1.7126669580254908e-05, | |
| "loss": 2.0255, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 14.308157099697885, | |
| "grad_norm": 1.6488431692123413, | |
| "learning_rate": 1.7096669504544293e-05, | |
| "loss": 2.0271, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 14.372608257804632, | |
| "grad_norm": 1.4515522718429565, | |
| "learning_rate": 1.706654019922164e-05, | |
| "loss": 2.055, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 14.43705941591138, | |
| "grad_norm": 1.530522108078003, | |
| "learning_rate": 1.7036282212938468e-05, | |
| "loss": 2.0461, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 14.501510574018127, | |
| "grad_norm": 1.4453781843185425, | |
| "learning_rate": 1.7005896096689544e-05, | |
| "loss": 2.0376, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 14.565961732124874, | |
| "grad_norm": 1.578723430633545, | |
| "learning_rate": 1.697538240380288e-05, | |
| "loss": 2.0439, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 14.630412890231622, | |
| "grad_norm": 1.3801120519638062, | |
| "learning_rate": 1.6944741689929646e-05, | |
| "loss": 2.0133, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 14.694864048338369, | |
| "grad_norm": 1.2451757192611694, | |
| "learning_rate": 1.6913974513034046e-05, | |
| "loss": 2.014, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 14.759315206445116, | |
| "grad_norm": 1.3197917938232422, | |
| "learning_rate": 1.6883081433383163e-05, | |
| "loss": 2.0191, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 14.823766364551863, | |
| "grad_norm": 1.5146923065185547, | |
| "learning_rate": 1.6852063013536765e-05, | |
| "loss": 1.992, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 14.823766364551863, | |
| "eval_loss": 2.0283043384552, | |
| "eval_runtime": 20.2251, | |
| "eval_samples_per_second": 79.357, | |
| "eval_steps_per_second": 4.994, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 14.88821752265861, | |
| "grad_norm": 1.3689446449279785, | |
| "learning_rate": 1.6820919818337035e-05, | |
| "loss": 1.9953, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 14.952668680765358, | |
| "grad_norm": 1.3700356483459473, | |
| "learning_rate": 1.6789652414898315e-05, | |
| "loss": 2.0154, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 15.017119838872105, | |
| "grad_norm": 1.4734395742416382, | |
| "learning_rate": 1.6758261372596768e-05, | |
| "loss": 2.1667, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 15.081570996978853, | |
| "grad_norm": 1.5056065320968628, | |
| "learning_rate": 1.6726747263059996e-05, | |
| "loss": 1.9786, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 15.1460221550856, | |
| "grad_norm": 1.51126229763031, | |
| "learning_rate": 1.6695110660156652e-05, | |
| "loss": 1.9809, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 15.210473313192347, | |
| "grad_norm": 1.5719977617263794, | |
| "learning_rate": 1.6663352139985977e-05, | |
| "loss": 1.9794, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 15.274924471299094, | |
| "grad_norm": 1.413379192352295, | |
| "learning_rate": 1.6631472280867314e-05, | |
| "loss": 1.9688, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 15.339375629405842, | |
| "grad_norm": 1.4571571350097656, | |
| "learning_rate": 1.6599471663329577e-05, | |
| "loss": 1.9489, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 15.403826787512589, | |
| "grad_norm": 1.4078450202941895, | |
| "learning_rate": 1.656735087010067e-05, | |
| "loss": 1.9597, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 15.468277945619334, | |
| "grad_norm": 1.5099396705627441, | |
| "learning_rate": 1.653511048609689e-05, | |
| "loss": 1.9692, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 15.468277945619334, | |
| "eval_loss": 1.986576795578003, | |
| "eval_runtime": 20.2391, | |
| "eval_samples_per_second": 79.302, | |
| "eval_steps_per_second": 4.99, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 15.532729103726084, | |
| "grad_norm": 1.317844271659851, | |
| "learning_rate": 1.6502751098412282e-05, | |
| "loss": 1.9565, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 15.597180261832829, | |
| "grad_norm": 1.5470753908157349, | |
| "learning_rate": 1.6470273296307907e-05, | |
| "loss": 1.9569, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 15.661631419939576, | |
| "grad_norm": 1.732040524482727, | |
| "learning_rate": 1.643767767120117e-05, | |
| "loss": 1.9452, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 15.726082578046324, | |
| "grad_norm": 1.4074490070343018, | |
| "learning_rate": 1.6404964816654993e-05, | |
| "loss": 1.9543, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 15.79053373615307, | |
| "grad_norm": 1.6578551530838013, | |
| "learning_rate": 1.6372135328367058e-05, | |
| "loss": 1.9372, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 15.854984894259818, | |
| "grad_norm": 1.514443039894104, | |
| "learning_rate": 1.6339189804158922e-05, | |
| "loss": 1.9374, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 15.919436052366565, | |
| "grad_norm": 1.6120061874389648, | |
| "learning_rate": 1.630612884396515e-05, | |
| "loss": 1.962, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 15.983887210473313, | |
| "grad_norm": 1.9274513721466064, | |
| "learning_rate": 1.6272953049822376e-05, | |
| "loss": 1.9457, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 16.048338368580062, | |
| "grad_norm": 1.4026292562484741, | |
| "learning_rate": 1.6239663025858356e-05, | |
| "loss": 2.1124, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 16.112789526686807, | |
| "grad_norm": 1.543957233428955, | |
| "learning_rate": 1.6206259378280956e-05, | |
| "loss": 1.9235, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 16.112789526686807, | |
| "eval_loss": 1.9452571868896484, | |
| "eval_runtime": 20.1977, | |
| "eval_samples_per_second": 79.465, | |
| "eval_steps_per_second": 5.001, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 16.177240684793556, | |
| "grad_norm": 1.6722089052200317, | |
| "learning_rate": 1.6172742715367124e-05, | |
| "loss": 1.8982, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 16.241691842900302, | |
| "grad_norm": 1.649903416633606, | |
| "learning_rate": 1.613911364745179e-05, | |
| "loss": 1.9176, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 16.30614300100705, | |
| "grad_norm": 1.3199961185455322, | |
| "learning_rate": 1.6105372786916776e-05, | |
| "loss": 1.9226, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 16.370594159113796, | |
| "grad_norm": 1.513606309890747, | |
| "learning_rate": 1.607152074817964e-05, | |
| "loss": 1.9141, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 16.435045317220546, | |
| "grad_norm": 1.3551160097122192, | |
| "learning_rate": 1.6037558147682473e-05, | |
| "loss": 1.9051, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 16.49949647532729, | |
| "grad_norm": 1.5009561777114868, | |
| "learning_rate": 1.60034856038807e-05, | |
| "loss": 1.9184, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 16.56394763343404, | |
| "grad_norm": 1.4896663427352905, | |
| "learning_rate": 1.5969303737231786e-05, | |
| "loss": 1.9005, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 16.628398791540786, | |
| "grad_norm": 1.4002718925476074, | |
| "learning_rate": 1.593501317018396e-05, | |
| "loss": 1.9057, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 16.69284994964753, | |
| "grad_norm": 1.5425680875778198, | |
| "learning_rate": 1.5900614527164876e-05, | |
| "loss": 1.8948, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 16.75730110775428, | |
| "grad_norm": 1.4177629947662354, | |
| "learning_rate": 1.586610843457024e-05, | |
| "loss": 1.8748, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 16.75730110775428, | |
| "eval_loss": 1.913808822631836, | |
| "eval_runtime": 19.5797, | |
| "eval_samples_per_second": 81.973, | |
| "eval_steps_per_second": 5.158, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 16.821752265861026, | |
| "grad_norm": 1.6956419944763184, | |
| "learning_rate": 1.5831495520752395e-05, | |
| "loss": 1.8708, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 16.886203423967775, | |
| "grad_norm": 1.5349304676055908, | |
| "learning_rate": 1.5796776416008897e-05, | |
| "loss": 1.8814, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 16.95065458207452, | |
| "grad_norm": 1.4432286024093628, | |
| "learning_rate": 1.5761951752571032e-05, | |
| "loss": 1.8622, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 17.01510574018127, | |
| "grad_norm": 1.4395033121109009, | |
| "learning_rate": 1.5727022164592282e-05, | |
| "loss": 2.0162, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 17.079556898288015, | |
| "grad_norm": 1.451162338256836, | |
| "learning_rate": 1.569198828813681e-05, | |
| "loss": 1.8803, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 17.144008056394764, | |
| "grad_norm": 1.6109211444854736, | |
| "learning_rate": 1.5656850761167848e-05, | |
| "loss": 1.8511, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 17.20845921450151, | |
| "grad_norm": 1.504939079284668, | |
| "learning_rate": 1.562161022353611e-05, | |
| "loss": 1.8727, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 17.27291037260826, | |
| "grad_norm": 1.2910106182098389, | |
| "learning_rate": 1.55862673169681e-05, | |
| "loss": 1.859, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 17.337361530715004, | |
| "grad_norm": 1.4325050115585327, | |
| "learning_rate": 1.5550822685054475e-05, | |
| "loss": 1.874, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 17.401812688821753, | |
| "grad_norm": 1.5667049884796143, | |
| "learning_rate": 1.5515276973238286e-05, | |
| "loss": 1.8678, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 17.401812688821753, | |
| "eval_loss": 1.879951000213623, | |
| "eval_runtime": 21.041, | |
| "eval_samples_per_second": 76.28, | |
| "eval_steps_per_second": 4.8, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 17.4662638469285, | |
| "grad_norm": 1.5697641372680664, | |
| "learning_rate": 1.5479630828803235e-05, | |
| "loss": 1.8341, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 17.530715005035248, | |
| "grad_norm": 1.5967832803726196, | |
| "learning_rate": 1.5443884900861904e-05, | |
| "loss": 1.8426, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 17.595166163141993, | |
| "grad_norm": 1.2664175033569336, | |
| "learning_rate": 1.5408039840343903e-05, | |
| "loss": 1.8516, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 17.659617321248742, | |
| "grad_norm": 1.5191247463226318, | |
| "learning_rate": 1.5372096299984064e-05, | |
| "loss": 1.8435, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 17.724068479355488, | |
| "grad_norm": 1.5485187768936157, | |
| "learning_rate": 1.5336054934310502e-05, | |
| "loss": 1.8551, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 17.788519637462237, | |
| "grad_norm": 1.3325048685073853, | |
| "learning_rate": 1.5299916399632726e-05, | |
| "loss": 1.8329, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 17.852970795568982, | |
| "grad_norm": 1.4892091751098633, | |
| "learning_rate": 1.5263681354029694e-05, | |
| "loss": 1.8275, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 17.91742195367573, | |
| "grad_norm": 1.3611677885055542, | |
| "learning_rate": 1.5227350457337809e-05, | |
| "loss": 1.8343, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 17.981873111782477, | |
| "grad_norm": 1.3932119607925415, | |
| "learning_rate": 1.5190924371138908e-05, | |
| "loss": 1.8319, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 18.046324269889226, | |
| "grad_norm": 1.4689136743545532, | |
| "learning_rate": 1.5154403758748228e-05, | |
| "loss": 1.9983, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 18.046324269889226, | |
| "eval_loss": 1.8525444269180298, | |
| "eval_runtime": 19.7876, | |
| "eval_samples_per_second": 81.111, | |
| "eval_steps_per_second": 5.104, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 18.11077542799597, | |
| "grad_norm": 1.7211060523986816, | |
| "learning_rate": 1.5117789285202313e-05, | |
| "loss": 1.8104, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 18.17522658610272, | |
| "grad_norm": 1.6318062543869019, | |
| "learning_rate": 1.5081081617246912e-05, | |
| "loss": 1.821, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 18.239677744209466, | |
| "grad_norm": 1.2402304410934448, | |
| "learning_rate": 1.5044281423324826e-05, | |
| "loss": 1.8104, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 18.304128902316215, | |
| "grad_norm": 1.438472032546997, | |
| "learning_rate": 1.500738937356376e-05, | |
| "loss": 1.8259, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 18.36858006042296, | |
| "grad_norm": 1.4083638191223145, | |
| "learning_rate": 1.4970406139764092e-05, | |
| "loss": 1.808, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 18.43303121852971, | |
| "grad_norm": 1.2466082572937012, | |
| "learning_rate": 1.4933332395386652e-05, | |
| "loss": 1.8232, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 18.497482376636455, | |
| "grad_norm": 1.2091064453125, | |
| "learning_rate": 1.4896168815540464e-05, | |
| "loss": 1.812, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 18.561933534743204, | |
| "grad_norm": 1.6740643978118896, | |
| "learning_rate": 1.4858916076970444e-05, | |
| "loss": 1.7973, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 18.62638469284995, | |
| "grad_norm": 1.2571892738342285, | |
| "learning_rate": 1.4821574858045073e-05, | |
| "loss": 1.8096, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 18.690835850956695, | |
| "grad_norm": 1.3580372333526611, | |
| "learning_rate": 1.4784145838744067e-05, | |
| "loss": 1.8117, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 18.690835850956695, | |
| "eval_loss": 1.8296641111373901, | |
| "eval_runtime": 22.6507, | |
| "eval_samples_per_second": 70.859, | |
| "eval_steps_per_second": 4.459, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 18.755287009063444, | |
| "grad_norm": 1.4074209928512573, | |
| "learning_rate": 1.4746629700645955e-05, | |
| "loss": 1.8154, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 18.81973816717019, | |
| "grad_norm": 1.4824714660644531, | |
| "learning_rate": 1.470902712691571e-05, | |
| "loss": 1.8, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 18.88418932527694, | |
| "grad_norm": 1.4426990747451782, | |
| "learning_rate": 1.4671338802292274e-05, | |
| "loss": 1.7956, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 18.948640483383684, | |
| "grad_norm": 1.4474570751190186, | |
| "learning_rate": 1.4633565413076114e-05, | |
| "loss": 1.7948, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 19.013091641490433, | |
| "grad_norm": 1.2493271827697754, | |
| "learning_rate": 1.4595707647116713e-05, | |
| "loss": 1.9644, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 19.07754279959718, | |
| "grad_norm": 1.4267578125, | |
| "learning_rate": 1.4557766193800036e-05, | |
| "loss": 1.781, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 19.141993957703928, | |
| "grad_norm": 1.355020523071289, | |
| "learning_rate": 1.4519741744036e-05, | |
| "loss": 1.7878, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 19.206445115810673, | |
| "grad_norm": 1.3975870609283447, | |
| "learning_rate": 1.4481634990245871e-05, | |
| "loss": 1.7899, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 19.270896273917423, | |
| "grad_norm": 1.426984190940857, | |
| "learning_rate": 1.4443446626349662e-05, | |
| "loss": 1.775, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 19.335347432024168, | |
| "grad_norm": 1.5936192274093628, | |
| "learning_rate": 1.4405177347753503e-05, | |
| "loss": 1.7697, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 19.335347432024168, | |
| "eval_loss": 1.8065377473831177, | |
| "eval_runtime": 20.2136, | |
| "eval_samples_per_second": 79.402, | |
| "eval_steps_per_second": 4.997, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 19.399798590130917, | |
| "grad_norm": 1.3064271211624146, | |
| "learning_rate": 1.4366827851336964e-05, | |
| "loss": 1.7844, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 19.464249748237663, | |
| "grad_norm": 1.2223880290985107, | |
| "learning_rate": 1.4328398835440381e-05, | |
| "loss": 1.7739, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 19.52870090634441, | |
| "grad_norm": 1.5116280317306519, | |
| "learning_rate": 1.4289890999852126e-05, | |
| "loss": 1.7707, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 19.593152064451157, | |
| "grad_norm": 1.3726651668548584, | |
| "learning_rate": 1.4251305045795874e-05, | |
| "loss": 1.7789, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 19.657603222557906, | |
| "grad_norm": 1.4748300313949585, | |
| "learning_rate": 1.4212641675917823e-05, | |
| "loss": 1.7715, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 19.72205438066465, | |
| "grad_norm": 1.4205374717712402, | |
| "learning_rate": 1.4173901594273917e-05, | |
| "loss": 1.7823, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 19.7865055387714, | |
| "grad_norm": 1.2562229633331299, | |
| "learning_rate": 1.4135085506316997e-05, | |
| "loss": 1.7643, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 19.850956696878146, | |
| "grad_norm": 1.3158072233200073, | |
| "learning_rate": 1.4096194118883982e-05, | |
| "loss": 1.7599, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 19.915407854984895, | |
| "grad_norm": 1.3303531408309937, | |
| "learning_rate": 1.4057228140182982e-05, | |
| "loss": 1.7701, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 19.97985901309164, | |
| "grad_norm": 1.4447650909423828, | |
| "learning_rate": 1.4018188279780412e-05, | |
| "loss": 1.7587, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 19.97985901309164, | |
| "eval_loss": 1.7869038581848145, | |
| "eval_runtime": 19.8118, | |
| "eval_samples_per_second": 81.012, | |
| "eval_steps_per_second": 5.098, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 20.04431017119839, | |
| "grad_norm": 1.4862005710601807, | |
| "learning_rate": 1.3979075248588054e-05, | |
| "loss": 1.9169, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 20.108761329305135, | |
| "grad_norm": 1.2900787591934204, | |
| "learning_rate": 1.3939889758850138e-05, | |
| "loss": 1.7603, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 20.173212487411885, | |
| "grad_norm": 1.1818557977676392, | |
| "learning_rate": 1.3900632524130343e-05, | |
| "loss": 1.7501, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 20.23766364551863, | |
| "grad_norm": 1.4612010717391968, | |
| "learning_rate": 1.3861304259298823e-05, | |
| "loss": 1.7412, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 20.30211480362538, | |
| "grad_norm": 1.2869311571121216, | |
| "learning_rate": 1.3821905680519181e-05, | |
| "loss": 1.7442, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 20.366565961732125, | |
| "grad_norm": 1.4396709203720093, | |
| "learning_rate": 1.378243750523543e-05, | |
| "loss": 1.7634, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 20.431017119838874, | |
| "grad_norm": 1.2522846460342407, | |
| "learning_rate": 1.3742900452158932e-05, | |
| "loss": 1.7422, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 20.49546827794562, | |
| "grad_norm": 1.4543647766113281, | |
| "learning_rate": 1.3703295241255296e-05, | |
| "loss": 1.75, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 20.55991943605237, | |
| "grad_norm": 1.191945195198059, | |
| "learning_rate": 1.3663622593731294e-05, | |
| "loss": 1.7302, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 20.624370594159114, | |
| "grad_norm": 1.4461244344711304, | |
| "learning_rate": 1.3623883232021693e-05, | |
| "loss": 1.7381, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 20.624370594159114, | |
| "eval_loss": 1.77000892162323, | |
| "eval_runtime": 19.6254, | |
| "eval_samples_per_second": 81.782, | |
| "eval_steps_per_second": 5.146, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 20.68882175226586, | |
| "grad_norm": 1.2070411443710327, | |
| "learning_rate": 1.3584077879776132e-05, | |
| "loss": 1.739, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 20.75327291037261, | |
| "grad_norm": 1.3720213174819946, | |
| "learning_rate": 1.3544207261845928e-05, | |
| "loss": 1.7366, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 20.817724068479354, | |
| "grad_norm": 1.3589030504226685, | |
| "learning_rate": 1.3504272104270876e-05, | |
| "loss": 1.7427, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 20.882175226586103, | |
| "grad_norm": 1.1105351448059082, | |
| "learning_rate": 1.3464273134266037e-05, | |
| "loss": 1.7487, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 20.94662638469285, | |
| "grad_norm": 1.2116628885269165, | |
| "learning_rate": 1.3424211080208478e-05, | |
| "loss": 1.7388, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 21.011077542799597, | |
| "grad_norm": 1.281503438949585, | |
| "learning_rate": 1.338408667162404e-05, | |
| "loss": 1.8821, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 21.075528700906343, | |
| "grad_norm": 1.3089734315872192, | |
| "learning_rate": 1.3343900639174007e-05, | |
| "loss": 1.7235, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 21.139979859013092, | |
| "grad_norm": 1.1901700496673584, | |
| "learning_rate": 1.3303653714641853e-05, | |
| "loss": 1.7281, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 21.204431017119838, | |
| "grad_norm": 1.307697057723999, | |
| "learning_rate": 1.3263346630919875e-05, | |
| "loss": 1.7273, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 21.268882175226587, | |
| "grad_norm": 1.1000251770019531, | |
| "learning_rate": 1.3222980121995867e-05, | |
| "loss": 1.7264, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 21.268882175226587, | |
| "eval_loss": 1.7537308931350708, | |
| "eval_runtime": 20.2292, | |
| "eval_samples_per_second": 79.341, | |
| "eval_steps_per_second": 4.993, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 21.333333333333332, | |
| "grad_norm": 1.1835920810699463, | |
| "learning_rate": 1.3182554922939748e-05, | |
| "loss": 1.7096, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 21.39778449144008, | |
| "grad_norm": 1.3662201166152954, | |
| "learning_rate": 1.3142071769890182e-05, | |
| "loss": 1.716, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 21.462235649546827, | |
| "grad_norm": 1.1962809562683105, | |
| "learning_rate": 1.3101531400041163e-05, | |
| "loss": 1.7298, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 21.526686807653576, | |
| "grad_norm": 1.1254611015319824, | |
| "learning_rate": 1.3060934551628603e-05, | |
| "loss": 1.7316, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 21.59113796576032, | |
| "grad_norm": 1.4266749620437622, | |
| "learning_rate": 1.3020281963916883e-05, | |
| "loss": 1.7149, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 21.65558912386707, | |
| "grad_norm": 1.2904185056686401, | |
| "learning_rate": 1.2979574377185385e-05, | |
| "loss": 1.7324, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 21.720040281973816, | |
| "grad_norm": 1.357013463973999, | |
| "learning_rate": 1.293881253271502e-05, | |
| "loss": 1.7155, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 21.784491440080565, | |
| "grad_norm": 1.285480260848999, | |
| "learning_rate": 1.289799717277473e-05, | |
| "loss": 1.724, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 21.84894259818731, | |
| "grad_norm": 1.2539184093475342, | |
| "learning_rate": 1.2857129040607963e-05, | |
| "loss": 1.7297, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 21.91339375629406, | |
| "grad_norm": 1.2335858345031738, | |
| "learning_rate": 1.281620888041915e-05, | |
| "loss": 1.7112, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 21.91339375629406, | |
| "eval_loss": 1.738411784172058, | |
| "eval_runtime": 19.8069, | |
| "eval_samples_per_second": 81.032, | |
| "eval_steps_per_second": 5.099, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 21.977844914400805, | |
| "grad_norm": 1.2766244411468506, | |
| "learning_rate": 1.2775237437360137e-05, | |
| "loss": 1.6879, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 22.042296072507554, | |
| "grad_norm": 1.1971969604492188, | |
| "learning_rate": 1.2734215457516639e-05, | |
| "loss": 1.8451, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 22.1067472306143, | |
| "grad_norm": 1.2040929794311523, | |
| "learning_rate": 1.269314368789463e-05, | |
| "loss": 1.7007, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 22.17119838872105, | |
| "grad_norm": 1.2112030982971191, | |
| "learning_rate": 1.2652022876406756e-05, | |
| "loss": 1.7094, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 22.235649546827794, | |
| "grad_norm": 1.2944767475128174, | |
| "learning_rate": 1.2610853771858702e-05, | |
| "loss": 1.69, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 22.300100704934543, | |
| "grad_norm": 1.3381768465042114, | |
| "learning_rate": 1.2569637123935581e-05, | |
| "loss": 1.7046, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 22.36455186304129, | |
| "grad_norm": 1.205153465270996, | |
| "learning_rate": 1.2528373683188247e-05, | |
| "loss": 1.7066, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 22.429003021148038, | |
| "grad_norm": 1.2156826257705688, | |
| "learning_rate": 1.248706420101966e-05, | |
| "loss": 1.7052, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 22.493454179254783, | |
| "grad_norm": 1.2174049615859985, | |
| "learning_rate": 1.2445709429671184e-05, | |
| "loss": 1.688, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 22.557905337361532, | |
| "grad_norm": 1.1301804780960083, | |
| "learning_rate": 1.2404310122208895e-05, | |
| "loss": 1.7036, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 22.557905337361532, | |
| "eval_loss": 1.7260056734085083, | |
| "eval_runtime": 20.2022, | |
| "eval_samples_per_second": 79.447, | |
| "eval_steps_per_second": 4.999, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 22.622356495468278, | |
| "grad_norm": 1.2046083211898804, | |
| "learning_rate": 1.2362867032509871e-05, | |
| "loss": 1.7096, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 22.686807653575023, | |
| "grad_norm": 1.3079477548599243, | |
| "learning_rate": 1.2321380915248446e-05, | |
| "loss": 1.7013, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 22.751258811681772, | |
| "grad_norm": 1.0908950567245483, | |
| "learning_rate": 1.2279852525882504e-05, | |
| "loss": 1.6883, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 22.815709969788518, | |
| "grad_norm": 1.3112512826919556, | |
| "learning_rate": 1.2238282620639677e-05, | |
| "loss": 1.7084, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 22.880161127895267, | |
| "grad_norm": 1.1353329420089722, | |
| "learning_rate": 1.2196671956503611e-05, | |
| "loss": 1.6871, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 22.944612286002013, | |
| "grad_norm": 1.1687755584716797, | |
| "learning_rate": 1.2155021291200161e-05, | |
| "loss": 1.6921, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 23.00906344410876, | |
| "grad_norm": 1.0935124158859253, | |
| "learning_rate": 1.2113331383183607e-05, | |
| "loss": 1.8299, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 23.073514602215507, | |
| "grad_norm": 1.1443811655044556, | |
| "learning_rate": 1.2071602991622822e-05, | |
| "loss": 1.695, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 23.137965760322256, | |
| "grad_norm": 1.0561773777008057, | |
| "learning_rate": 1.202983687638747e-05, | |
| "loss": 1.6879, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 23.202416918429, | |
| "grad_norm": 1.1481385231018066, | |
| "learning_rate": 1.198803379803416e-05, | |
| "loss": 1.6766, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 23.202416918429, | |
| "eval_loss": 1.7131911516189575, | |
| "eval_runtime": 20.2073, | |
| "eval_samples_per_second": 79.427, | |
| "eval_steps_per_second": 4.998, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 23.26686807653575, | |
| "grad_norm": 1.1637824773788452, | |
| "learning_rate": 1.1946194517792584e-05, | |
| "loss": 1.6887, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 23.331319234642496, | |
| "grad_norm": 1.168934941291809, | |
| "learning_rate": 1.190431979755168e-05, | |
| "loss": 1.6692, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 23.395770392749245, | |
| "grad_norm": 1.1618574857711792, | |
| "learning_rate": 1.1862410399845739e-05, | |
| "loss": 1.6696, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 23.46022155085599, | |
| "grad_norm": 1.118910312652588, | |
| "learning_rate": 1.1820467087840526e-05, | |
| "loss": 1.6804, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 23.52467270896274, | |
| "grad_norm": 1.1120346784591675, | |
| "learning_rate": 1.1778490625319376e-05, | |
| "loss": 1.6863, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 23.589123867069485, | |
| "grad_norm": 1.1935482025146484, | |
| "learning_rate": 1.1736481776669307e-05, | |
| "loss": 1.6881, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 23.653575025176234, | |
| "grad_norm": 1.1648095846176147, | |
| "learning_rate": 1.1694441306867062e-05, | |
| "loss": 1.6813, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 23.71802618328298, | |
| "grad_norm": 1.0669279098510742, | |
| "learning_rate": 1.1652369981465218e-05, | |
| "loss": 1.6737, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 23.78247734138973, | |
| "grad_norm": 1.2142657041549683, | |
| "learning_rate": 1.1610268566578233e-05, | |
| "loss": 1.6825, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 23.846928499496475, | |
| "grad_norm": 1.05051851272583, | |
| "learning_rate": 1.1568137828868478e-05, | |
| "loss": 1.6606, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 23.846928499496475, | |
| "eval_loss": 1.7030967473983765, | |
| "eval_runtime": 20.2178, | |
| "eval_samples_per_second": 79.385, | |
| "eval_steps_per_second": 4.996, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 23.911379657603224, | |
| "grad_norm": 1.1508402824401855, | |
| "learning_rate": 1.15259785355323e-05, | |
| "loss": 1.6664, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 23.97583081570997, | |
| "grad_norm": 1.260407567024231, | |
| "learning_rate": 1.1483791454286027e-05, | |
| "loss": 1.6875, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 24.040281973816718, | |
| "grad_norm": 1.032926082611084, | |
| "learning_rate": 1.1441577353352023e-05, | |
| "loss": 1.7966, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 24.104733131923464, | |
| "grad_norm": 1.0912493467330933, | |
| "learning_rate": 1.1399337001444658e-05, | |
| "loss": 1.6737, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 24.169184290030213, | |
| "grad_norm": 1.026868462562561, | |
| "learning_rate": 1.1357071167756341e-05, | |
| "loss": 1.657, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 24.23363544813696, | |
| "grad_norm": 1.212310552597046, | |
| "learning_rate": 1.13147806219435e-05, | |
| "loss": 1.673, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 24.298086606243707, | |
| "grad_norm": 1.0902752876281738, | |
| "learning_rate": 1.1272466134112562e-05, | |
| "loss": 1.6793, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 24.362537764350453, | |
| "grad_norm": 1.1671861410140991, | |
| "learning_rate": 1.1230128474805948e-05, | |
| "loss": 1.664, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 24.426988922457202, | |
| "grad_norm": 1.0966593027114868, | |
| "learning_rate": 1.1187768414988015e-05, | |
| "loss": 1.6649, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 24.491440080563947, | |
| "grad_norm": 1.0915180444717407, | |
| "learning_rate": 1.114538672603104e-05, | |
| "loss": 1.6749, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 24.491440080563947, | |
| "eval_loss": 1.6930159330368042, | |
| "eval_runtime": 19.486, | |
| "eval_samples_per_second": 82.367, | |
| "eval_steps_per_second": 5.183, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 24.555891238670696, | |
| "grad_norm": 1.0787924528121948, | |
| "learning_rate": 1.1102984179701157e-05, | |
| "loss": 1.659, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 24.620342396777442, | |
| "grad_norm": 1.0481791496276855, | |
| "learning_rate": 1.1060561548144321e-05, | |
| "loss": 1.6558, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 24.68479355488419, | |
| "grad_norm": 1.06913161277771, | |
| "learning_rate": 1.1018119603872228e-05, | |
| "loss": 1.6551, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 24.749244712990937, | |
| "grad_norm": 0.9699168801307678, | |
| "learning_rate": 1.0975659119748265e-05, | |
| "loss": 1.6579, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 24.813695871097686, | |
| "grad_norm": 1.0122530460357666, | |
| "learning_rate": 1.0933180868973414e-05, | |
| "loss": 1.6517, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 24.87814702920443, | |
| "grad_norm": 0.935750424861908, | |
| "learning_rate": 1.08906856250722e-05, | |
| "loss": 1.6567, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 24.942598187311177, | |
| "grad_norm": 0.929023027420044, | |
| "learning_rate": 1.0848174161878584e-05, | |
| "loss": 1.6501, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 25.007049345417926, | |
| "grad_norm": 1.1005076169967651, | |
| "learning_rate": 1.080564725352188e-05, | |
| "loss": 1.8029, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 25.07150050352467, | |
| "grad_norm": 0.9921239018440247, | |
| "learning_rate": 1.076310567441266e-05, | |
| "loss": 1.6426, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 25.13595166163142, | |
| "grad_norm": 1.0867966413497925, | |
| "learning_rate": 1.072055019922864e-05, | |
| "loss": 1.6545, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 25.13595166163142, | |
| "eval_loss": 1.6841835975646973, | |
| "eval_runtime": 19.9451, | |
| "eval_samples_per_second": 80.471, | |
| "eval_steps_per_second": 5.064, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 25.200402819738166, | |
| "grad_norm": 0.9666146636009216, | |
| "learning_rate": 1.067798160290059e-05, | |
| "loss": 1.6428, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 25.264853977844915, | |
| "grad_norm": 1.1212939023971558, | |
| "learning_rate": 1.0635400660598214e-05, | |
| "loss": 1.6361, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 25.32930513595166, | |
| "grad_norm": 1.075994610786438, | |
| "learning_rate": 1.0592808147716032e-05, | |
| "loss": 1.6567, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 25.39375629405841, | |
| "grad_norm": 0.9849887490272522, | |
| "learning_rate": 1.0550204839859265e-05, | |
| "loss": 1.6587, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 25.458207452165155, | |
| "grad_norm": 0.932995617389679, | |
| "learning_rate": 1.0507591512829707e-05, | |
| "loss": 1.6471, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 25.522658610271904, | |
| "grad_norm": 1.0005803108215332, | |
| "learning_rate": 1.0464968942611608e-05, | |
| "loss": 1.6508, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 25.58710976837865, | |
| "grad_norm": 1.2026358842849731, | |
| "learning_rate": 1.0422337905357523e-05, | |
| "loss": 1.6506, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 25.6515609264854, | |
| "grad_norm": 1.075043797492981, | |
| "learning_rate": 1.0379699177374199e-05, | |
| "loss": 1.6372, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 25.716012084592144, | |
| "grad_norm": 0.983676016330719, | |
| "learning_rate": 1.0337053535108427e-05, | |
| "loss": 1.6494, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 25.780463242698893, | |
| "grad_norm": 0.9707674384117126, | |
| "learning_rate": 1.0294401755132912e-05, | |
| "loss": 1.6509, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 25.780463242698893, | |
| "eval_loss": 1.6760313510894775, | |
| "eval_runtime": 19.7916, | |
| "eval_samples_per_second": 81.095, | |
| "eval_steps_per_second": 5.103, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 25.84491440080564, | |
| "grad_norm": 1.0340772867202759, | |
| "learning_rate": 1.0251744614132117e-05, | |
| "loss": 1.6614, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 25.909365558912388, | |
| "grad_norm": 0.9816955924034119, | |
| "learning_rate": 1.0209082888888143e-05, | |
| "loss": 1.6327, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 25.973816717019133, | |
| "grad_norm": 0.9904563426971436, | |
| "learning_rate": 1.0166417356266546e-05, | |
| "loss": 1.6408, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 26.038267875125882, | |
| "grad_norm": 0.9868249893188477, | |
| "learning_rate": 1.0123748793202242e-05, | |
| "loss": 1.7511, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 26.102719033232628, | |
| "grad_norm": 0.9166064262390137, | |
| "learning_rate": 1.0081077976685307e-05, | |
| "loss": 1.6361, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 26.167170191339377, | |
| "grad_norm": 0.9603624939918518, | |
| "learning_rate": 1.0038405683746868e-05, | |
| "loss": 1.6374, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 26.231621349446122, | |
| "grad_norm": 0.9980469346046448, | |
| "learning_rate": 9.995732691444932e-06, | |
| "loss": 1.6464, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 26.29607250755287, | |
| "grad_norm": 1.0243116617202759, | |
| "learning_rate": 9.953059776850238e-06, | |
| "loss": 1.6364, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 26.360523665659617, | |
| "grad_norm": 0.9727787971496582, | |
| "learning_rate": 9.910387717032115e-06, | |
| "loss": 1.6366, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 26.424974823766366, | |
| "grad_norm": 0.9883456826210022, | |
| "learning_rate": 9.86771728904433e-06, | |
| "loss": 1.6423, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 26.424974823766366, | |
| "eval_loss": 1.6685707569122314, | |
| "eval_runtime": 21.0152, | |
| "eval_samples_per_second": 76.373, | |
| "eval_steps_per_second": 4.806, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 26.48942598187311, | |
| "grad_norm": 0.9553434252738953, | |
| "learning_rate": 9.82504926991092e-06, | |
| "loss": 1.6439, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 26.55387713997986, | |
| "grad_norm": 1.038282036781311, | |
| "learning_rate": 9.782384436612072e-06, | |
| "loss": 1.641, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 26.618328298086606, | |
| "grad_norm": 1.0809876918792725, | |
| "learning_rate": 9.73972356606995e-06, | |
| "loss": 1.6335, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 26.682779456193355, | |
| "grad_norm": 1.0616182088851929, | |
| "learning_rate": 9.697067435134564e-06, | |
| "loss": 1.6168, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 26.7472306143001, | |
| "grad_norm": 0.8756535649299622, | |
| "learning_rate": 9.654416820569618e-06, | |
| "loss": 1.6203, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 26.81168177240685, | |
| "grad_norm": 1.1267576217651367, | |
| "learning_rate": 9.611772499038345e-06, | |
| "loss": 1.6279, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 26.876132930513595, | |
| "grad_norm": 0.9867852926254272, | |
| "learning_rate": 9.569135247089401e-06, | |
| "loss": 1.6323, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 26.94058408862034, | |
| "grad_norm": 0.9736083149909973, | |
| "learning_rate": 9.526505841142702e-06, | |
| "loss": 1.6328, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 27.00503524672709, | |
| "grad_norm": 1.8718211650848389, | |
| "learning_rate": 9.48388505747529e-06, | |
| "loss": 1.7769, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 27.069486404833835, | |
| "grad_norm": 0.9595508575439453, | |
| "learning_rate": 9.441273672207187e-06, | |
| "loss": 1.6209, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 27.069486404833835, | |
| "eval_loss": 1.6613610982894897, | |
| "eval_runtime": 20.2063, | |
| "eval_samples_per_second": 79.431, | |
| "eval_steps_per_second": 4.998, | |
| "step": 4200 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 7750, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.3849836110034764e+20, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |