| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.999849510910459, |
| "eval_steps": 1000, |
| "global_step": 3322, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0030097817908201654, |
| "grad_norm": 0.5666791796684265, |
| "learning_rate": 6.006006006006006e-06, |
| "loss": 1.0909, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.006019563581640331, |
| "grad_norm": 0.5926806926727295, |
| "learning_rate": 1.2012012012012012e-05, |
| "loss": 1.0813, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.009029345372460496, |
| "grad_norm": 0.5680918097496033, |
| "learning_rate": 1.801801801801802e-05, |
| "loss": 1.051, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.012039127163280662, |
| "grad_norm": 0.4019787907600403, |
| "learning_rate": 2.4024024024024024e-05, |
| "loss": 0.9406, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.015048908954100828, |
| "grad_norm": 0.289296418428421, |
| "learning_rate": 3.0030030030030033e-05, |
| "loss": 0.9232, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.01805869074492099, |
| "grad_norm": 0.23350511491298676, |
| "learning_rate": 3.603603603603604e-05, |
| "loss": 0.8762, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.021068472535741158, |
| "grad_norm": 0.17287714779376984, |
| "learning_rate": 4.204204204204204e-05, |
| "loss": 0.8496, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.024078254326561323, |
| "grad_norm": 0.16750749945640564, |
| "learning_rate": 4.804804804804805e-05, |
| "loss": 0.8424, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.02708803611738149, |
| "grad_norm": 0.15370824933052063, |
| "learning_rate": 5.405405405405406e-05, |
| "loss": 0.8601, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.030097817908201655, |
| "grad_norm": 0.16766728460788727, |
| "learning_rate": 6.0060060060060066e-05, |
| "loss": 0.8179, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.03310759969902182, |
| "grad_norm": 0.2106652557849884, |
| "learning_rate": 6.606606606606607e-05, |
| "loss": 0.8094, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.03611738148984198, |
| "grad_norm": 0.22345463931560516, |
| "learning_rate": 7.207207207207208e-05, |
| "loss": 0.8096, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.03912716328066215, |
| "grad_norm": 0.23186658322811127, |
| "learning_rate": 7.807807807807808e-05, |
| "loss": 0.8195, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.042136945071482315, |
| "grad_norm": 0.3011253774166107, |
| "learning_rate": 8.408408408408409e-05, |
| "loss": 0.8184, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.045146726862302484, |
| "grad_norm": 0.21401682496070862, |
| "learning_rate": 9.009009009009009e-05, |
| "loss": 0.8, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.04815650865312265, |
| "grad_norm": 0.2243528664112091, |
| "learning_rate": 9.60960960960961e-05, |
| "loss": 0.7739, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.051166290443942816, |
| "grad_norm": 0.46177345514297485, |
| "learning_rate": 0.00010210210210210212, |
| "loss": 0.8172, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.05417607223476298, |
| "grad_norm": 0.2084399163722992, |
| "learning_rate": 0.00010810810810810812, |
| "loss": 0.8181, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.05718585402558315, |
| "grad_norm": 0.24123747646808624, |
| "learning_rate": 0.00011411411411411413, |
| "loss": 0.8032, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.06019563581640331, |
| "grad_norm": 0.23546789586544037, |
| "learning_rate": 0.00012012012012012013, |
| "loss": 0.7775, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.06320541760722348, |
| "grad_norm": 0.23672956228256226, |
| "learning_rate": 0.00012612612612612612, |
| "loss": 0.821, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.06621519939804364, |
| "grad_norm": 0.22630153596401215, |
| "learning_rate": 0.00013213213213213214, |
| "loss": 0.7892, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.0692249811888638, |
| "grad_norm": 0.2221691757440567, |
| "learning_rate": 0.00013813813813813813, |
| "loss": 0.7932, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.07223476297968397, |
| "grad_norm": 0.2853372097015381, |
| "learning_rate": 0.00014414414414414415, |
| "loss": 0.7993, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.07524454477050414, |
| "grad_norm": 0.24529512226581573, |
| "learning_rate": 0.00015015015015015014, |
| "loss": 0.8281, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.0782543265613243, |
| "grad_norm": 0.20747515559196472, |
| "learning_rate": 0.00015615615615615616, |
| "loss": 0.7695, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.08126410835214447, |
| "grad_norm": 0.21736431121826172, |
| "learning_rate": 0.00016216216216216218, |
| "loss": 0.7863, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.08427389014296463, |
| "grad_norm": 0.25259608030319214, |
| "learning_rate": 0.00016816816816816817, |
| "loss": 0.7616, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.0872836719337848, |
| "grad_norm": 0.19296181201934814, |
| "learning_rate": 0.0001741741741741742, |
| "loss": 0.7707, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.09029345372460497, |
| "grad_norm": 0.26890674233436584, |
| "learning_rate": 0.00018018018018018018, |
| "loss": 0.7744, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.09330323551542513, |
| "grad_norm": 0.2053564041852951, |
| "learning_rate": 0.0001861861861861862, |
| "loss": 0.743, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.0963130173062453, |
| "grad_norm": 0.20555047690868378, |
| "learning_rate": 0.0001921921921921922, |
| "loss": 0.8089, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.09932279909706546, |
| "grad_norm": 0.2425510585308075, |
| "learning_rate": 0.0001981981981981982, |
| "loss": 0.8012, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.10233258088788563, |
| "grad_norm": 0.2260735034942627, |
| "learning_rate": 0.00019999729347501484, |
| "loss": 0.7843, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.1053423626787058, |
| "grad_norm": 0.21512670814990997, |
| "learning_rate": 0.0001999840373787939, |
| "loss": 0.7992, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.10835214446952596, |
| "grad_norm": 0.22542227804660797, |
| "learning_rate": 0.0001999597360570722, |
| "loss": 0.772, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.11136192626034612, |
| "grad_norm": 0.15299277007579803, |
| "learning_rate": 0.0001999243921944139, |
| "loss": 0.7511, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.1143717080511663, |
| "grad_norm": 0.27176716923713684, |
| "learning_rate": 0.00019987800969525164, |
| "loss": 0.7735, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.11738148984198646, |
| "grad_norm": 0.28357240557670593, |
| "learning_rate": 0.00019982059368345496, |
| "loss": 0.7729, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.12039127163280662, |
| "grad_norm": 0.18754735589027405, |
| "learning_rate": 0.00019975215050176433, |
| "loss": 0.795, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.12340105342362678, |
| "grad_norm": 0.15982478857040405, |
| "learning_rate": 0.00019967268771109035, |
| "loss": 0.7671, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.12641083521444696, |
| "grad_norm": 0.18540535867214203, |
| "learning_rate": 0.00019958221408967875, |
| "loss": 0.7491, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.1294206170052671, |
| "grad_norm": 0.4123195707798004, |
| "learning_rate": 0.00019948073963214043, |
| "loss": 0.7766, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.13243039879608728, |
| "grad_norm": 0.1608065813779831, |
| "learning_rate": 0.00019936827554834738, |
| "loss": 0.7904, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.13544018058690746, |
| "grad_norm": 0.21908819675445557, |
| "learning_rate": 0.00019924483426219452, |
| "loss": 0.7816, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.1384499623777276, |
| "grad_norm": 0.17256338894367218, |
| "learning_rate": 0.00019911042941022695, |
| "loss": 0.7982, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.14145974416854779, |
| "grad_norm": 0.15286709368228912, |
| "learning_rate": 0.00019896507584013376, |
| "loss": 0.793, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.14446952595936793, |
| "grad_norm": 0.1949639767408371, |
| "learning_rate": 0.00019880878960910772, |
| "loss": 0.745, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.1474793077501881, |
| "grad_norm": 0.22766350209712982, |
| "learning_rate": 0.00019864158798207137, |
| "loss": 0.765, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.1504890895410083, |
| "grad_norm": 0.17246346175670624, |
| "learning_rate": 0.0001984634894297699, |
| "loss": 0.7494, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.15349887133182843, |
| "grad_norm": 0.18809442222118378, |
| "learning_rate": 0.00019827451362673052, |
| "loss": 0.7906, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.1565086531226486, |
| "grad_norm": 0.287610799074173, |
| "learning_rate": 0.00019807468144908928, |
| "loss": 0.7991, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.1595184349134688, |
| "grad_norm": 0.2345225214958191, |
| "learning_rate": 0.00019786401497228466, |
| "loss": 0.7593, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.16252821670428894, |
| "grad_norm": 0.24265146255493164, |
| "learning_rate": 0.00019764253746861886, |
| "loss": 0.7966, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.1655379984951091, |
| "grad_norm": 0.1692132204771042, |
| "learning_rate": 0.00019741027340468715, |
| "loss": 0.7525, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.16854778028592926, |
| "grad_norm": 0.18456414341926575, |
| "learning_rate": 0.00019716724843867487, |
| "loss": 0.7706, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.17155756207674944, |
| "grad_norm": 0.2189481407403946, |
| "learning_rate": 0.000196913489417523, |
| "loss": 0.7683, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.1745673438675696, |
| "grad_norm": 0.24484622478485107, |
| "learning_rate": 0.00019664902437396245, |
| "loss": 0.821, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.17757712565838976, |
| "grad_norm": 0.1760026067495346, |
| "learning_rate": 0.00019637388252341715, |
| "loss": 0.7744, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.18058690744920994, |
| "grad_norm": 0.18553385138511658, |
| "learning_rate": 0.00019608809426077678, |
| "loss": 0.7607, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.1835966892400301, |
| "grad_norm": 0.23498745262622833, |
| "learning_rate": 0.000195791691157039, |
| "loss": 0.7707, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.18660647103085026, |
| "grad_norm": 0.20103472471237183, |
| "learning_rate": 0.00019548470595582166, |
| "loss": 0.7487, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.18961625282167044, |
| "grad_norm": 0.18829959630966187, |
| "learning_rate": 0.00019516717256974592, |
| "loss": 0.7653, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.1926260346124906, |
| "grad_norm": 0.24024085700511932, |
| "learning_rate": 0.00019483912607668965, |
| "loss": 0.7918, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.19563581640331076, |
| "grad_norm": 0.18580889701843262, |
| "learning_rate": 0.00019450060271591243, |
| "loss": 0.8022, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.1986455981941309, |
| "grad_norm": 0.2515565752983093, |
| "learning_rate": 0.0001941516398840524, |
| "loss": 0.7548, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.2016553799849511, |
| "grad_norm": 0.18456920981407166, |
| "learning_rate": 0.00019379227613099473, |
| "loss": 0.7903, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.20466516177577126, |
| "grad_norm": 0.23653873801231384, |
| "learning_rate": 0.00019342255115561337, |
| "loss": 0.7917, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.2076749435665914, |
| "grad_norm": 0.26416492462158203, |
| "learning_rate": 0.00019304250580138524, |
| "loss": 0.7879, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.2106847253574116, |
| "grad_norm": 0.22712339460849762, |
| "learning_rate": 0.0001926521820518784, |
| "loss": 0.8014, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.21369450714823177, |
| "grad_norm": 0.17389413714408875, |
| "learning_rate": 0.00019225162302611412, |
| "loss": 0.7403, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.21670428893905191, |
| "grad_norm": 0.23703350126743317, |
| "learning_rate": 0.00019184087297380344, |
| "loss": 0.7795, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.2197140707298721, |
| "grad_norm": 0.20507369935512543, |
| "learning_rate": 0.000191419977270459, |
| "loss": 0.8047, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.22272385252069224, |
| "grad_norm": 0.1989058554172516, |
| "learning_rate": 0.0001909889824123824, |
| "loss": 0.7786, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.22573363431151242, |
| "grad_norm": 0.1992356926202774, |
| "learning_rate": 0.00019054793601152773, |
| "loss": 0.7352, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.2287434161023326, |
| "grad_norm": 0.1841665804386139, |
| "learning_rate": 0.0001900968867902419, |
| "loss": 0.778, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.23175319789315274, |
| "grad_norm": 0.1729171723127365, |
| "learning_rate": 0.00018963588457588228, |
| "loss": 0.7776, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.23476297968397292, |
| "grad_norm": 0.18624809384346008, |
| "learning_rate": 0.00018916498029531223, |
| "loss": 0.7841, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.23777276147479307, |
| "grad_norm": 0.19757080078125, |
| "learning_rate": 0.00018868422596927535, |
| "loss": 0.7628, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.24078254326561324, |
| "grad_norm": 0.1995578110218048, |
| "learning_rate": 0.00018819367470664862, |
| "loss": 0.7403, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.24379232505643342, |
| "grad_norm": 0.2151431441307068, |
| "learning_rate": 0.00018769338069857548, |
| "loss": 0.7581, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.24680210684725357, |
| "grad_norm": 0.2272290140390396, |
| "learning_rate": 0.00018718339921247945, |
| "loss": 0.7914, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.24981188863807374, |
| "grad_norm": 0.1463918834924698, |
| "learning_rate": 0.0001866637865859586, |
| "loss": 0.7953, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.2528216704288939, |
| "grad_norm": 0.2657776176929474, |
| "learning_rate": 0.00018613460022056215, |
| "loss": 0.7576, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.2558314522197141, |
| "grad_norm": 0.2566029131412506, |
| "learning_rate": 0.000185595898575449, |
| "loss": 0.7508, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.2588412340105342, |
| "grad_norm": 0.2356068342924118, |
| "learning_rate": 0.00018504774116093008, |
| "loss": 0.7332, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.2618510158013544, |
| "grad_norm": 0.20745734870433807, |
| "learning_rate": 0.00018449018853189403, |
| "loss": 0.756, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.26486079759217457, |
| "grad_norm": 0.1839013397693634, |
| "learning_rate": 0.0001839233022811179, |
| "loss": 0.7776, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.26787057938299474, |
| "grad_norm": 0.238608717918396, |
| "learning_rate": 0.00018334714503246273, |
| "loss": 0.7771, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.2708803611738149, |
| "grad_norm": 0.17599129676818848, |
| "learning_rate": 0.00018276178043395586, |
| "loss": 0.7544, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.27389014296463504, |
| "grad_norm": 0.23767121136188507, |
| "learning_rate": 0.00018216727315075945, |
| "loss": 0.7946, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.2768999247554552, |
| "grad_norm": 0.180665984749794, |
| "learning_rate": 0.00018156368885802695, |
| "loss": 0.8202, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.2799097065462754, |
| "grad_norm": 0.20493340492248535, |
| "learning_rate": 0.00018095109423364817, |
| "loss": 0.7823, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.28291948833709557, |
| "grad_norm": 0.29762348532676697, |
| "learning_rate": 0.0001803295569508832, |
| "loss": 0.7637, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.28592927012791575, |
| "grad_norm": 0.2135084718465805, |
| "learning_rate": 0.0001796991456708866, |
| "loss": 0.768, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.28893905191873587, |
| "grad_norm": 0.21105533838272095, |
| "learning_rate": 0.0001790599300351225, |
| "loss": 0.7492, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.29194883370955604, |
| "grad_norm": 0.2163008451461792, |
| "learning_rate": 0.00017841198065767107, |
| "loss": 0.758, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.2949586155003762, |
| "grad_norm": 0.16340641677379608, |
| "learning_rate": 0.00017775536911742806, |
| "loss": 0.7739, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.2979683972911964, |
| "grad_norm": 0.20751433074474335, |
| "learning_rate": 0.00017709016795019742, |
| "loss": 0.7692, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.3009781790820166, |
| "grad_norm": 0.1802573949098587, |
| "learning_rate": 0.00017641645064067816, |
| "loss": 0.7886, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.3009781790820166, |
| "eval_loss": 0.8210044503211975, |
| "eval_runtime": 143.0609, |
| "eval_samples_per_second": 39.116, |
| "eval_steps_per_second": 4.893, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.3039879608728367, |
| "grad_norm": 0.17218339443206787, |
| "learning_rate": 0.0001757342916143466, |
| "loss": 0.7595, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.30699774266365687, |
| "grad_norm": 0.3174282908439636, |
| "learning_rate": 0.00017504376622923465, |
| "loss": 0.7821, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.31000752445447705, |
| "grad_norm": 0.1862519532442093, |
| "learning_rate": 0.00017434495076760483, |
| "loss": 0.7982, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.3130173062452972, |
| "grad_norm": 0.19561271369457245, |
| "learning_rate": 0.00017363792242752353, |
| "loss": 0.7422, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.3160270880361174, |
| "grad_norm": 0.24750228226184845, |
| "learning_rate": 0.000172922759314333, |
| "loss": 0.7425, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.3190368698269376, |
| "grad_norm": 0.22299301624298096, |
| "learning_rate": 0.0001721995404320228, |
| "loss": 0.7392, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.3220466516177577, |
| "grad_norm": 0.38551756739616394, |
| "learning_rate": 0.0001714683456745026, |
| "loss": 0.7913, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.32505643340857787, |
| "grad_norm": 0.19068562984466553, |
| "learning_rate": 0.00017072925581677594, |
| "loss": 0.7368, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.32806621519939805, |
| "grad_norm": 0.14717283844947815, |
| "learning_rate": 0.0001699823525060174, |
| "loss": 0.7938, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.3310759969902182, |
| "grad_norm": 0.20433548092842102, |
| "learning_rate": 0.00016922771825255263, |
| "loss": 0.7672, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.3340857787810384, |
| "grad_norm": 0.24607650935649872, |
| "learning_rate": 0.0001684654364207438, |
| "loss": 0.7971, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.3370955605718585, |
| "grad_norm": 0.1704825758934021, |
| "learning_rate": 0.00016769559121978026, |
| "loss": 0.7283, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.3401053423626787, |
| "grad_norm": 0.18218447268009186, |
| "learning_rate": 0.0001669182676943757, |
| "loss": 0.7405, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.3431151241534989, |
| "grad_norm": 0.25802308320999146, |
| "learning_rate": 0.0001661335517153737, |
| "loss": 0.7821, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.34612490594431905, |
| "grad_norm": 0.2192961871623993, |
| "learning_rate": 0.00016534152997026125, |
| "loss": 0.7392, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.3491346877351392, |
| "grad_norm": 0.19438660144805908, |
| "learning_rate": 0.00016454228995359252, |
| "loss": 0.7928, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.35214446952595935, |
| "grad_norm": 0.16023948788642883, |
| "learning_rate": 0.00016373591995732338, |
| "loss": 0.7542, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.3551542513167795, |
| "grad_norm": 0.32932865619659424, |
| "learning_rate": 0.0001629225090610577, |
| "loss": 0.7766, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.3581640331075997, |
| "grad_norm": 0.282248854637146, |
| "learning_rate": 0.00016210214712220687, |
| "loss": 0.7528, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.3611738148984199, |
| "grad_norm": 0.2124871164560318, |
| "learning_rate": 0.00016127492476606308, |
| "loss": 0.7874, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.36418359668924005, |
| "grad_norm": 0.23827847838401794, |
| "learning_rate": 0.00016044093337578815, |
| "loss": 0.7599, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.3671933784800602, |
| "grad_norm": 0.22505390644073486, |
| "learning_rate": 0.00015960026508231824, |
| "loss": 0.7707, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.37020316027088035, |
| "grad_norm": 0.17258504033088684, |
| "learning_rate": 0.00015875301275418638, |
| "loss": 0.8102, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.3732129420617005, |
| "grad_norm": 0.24378369748592377, |
| "learning_rate": 0.00015789926998726315, |
| "loss": 0.7388, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.3762227238525207, |
| "grad_norm": 0.22882601618766785, |
| "learning_rate": 0.00015703913109441713, |
| "loss": 0.7707, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.3792325056433409, |
| "grad_norm": 0.20203706622123718, |
| "learning_rate": 0.0001561726910950962, |
| "loss": 0.7444, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.382242287434161, |
| "grad_norm": 0.25836071372032166, |
| "learning_rate": 0.00015530004570483093, |
| "loss": 0.7838, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.3852520692249812, |
| "grad_norm": 0.16395032405853271, |
| "learning_rate": 0.00015442129132466054, |
| "loss": 0.7281, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.38826185101580135, |
| "grad_norm": 0.19731546938419342, |
| "learning_rate": 0.00015353652503048384, |
| "loss": 0.7471, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.3912716328066215, |
| "grad_norm": 0.23747088015079498, |
| "learning_rate": 0.00015264584456233502, |
| "loss": 0.7469, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.3942814145974417, |
| "grad_norm": 0.222218319773674, |
| "learning_rate": 0.0001517493483135864, |
| "loss": 0.7945, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.3972911963882618, |
| "grad_norm": 0.21308743953704834, |
| "learning_rate": 0.00015084713532007905, |
| "loss": 0.7659, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.400300978179082, |
| "grad_norm": 0.25059592723846436, |
| "learning_rate": 0.00014993930524918208, |
| "loss": 0.7718, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.4033107599699022, |
| "grad_norm": 0.2029498815536499, |
| "learning_rate": 0.00014902595838878256, |
| "loss": 0.759, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.40632054176072235, |
| "grad_norm": 0.26108860969543457, |
| "learning_rate": 0.0001481071956362067, |
| "loss": 0.7568, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.40933032355154253, |
| "grad_norm": 0.16724595427513123, |
| "learning_rate": 0.0001471831184870737, |
| "loss": 0.7504, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.4123401053423627, |
| "grad_norm": 0.2351473867893219, |
| "learning_rate": 0.00014625382902408356, |
| "loss": 0.7365, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.4153498871331828, |
| "grad_norm": 0.20350322127342224, |
| "learning_rate": 0.00014531942990573998, |
| "loss": 0.7444, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.418359668924003, |
| "grad_norm": 0.2595962882041931, |
| "learning_rate": 0.00014438002435500979, |
| "loss": 0.7574, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.4213694507148232, |
| "grad_norm": 0.28216540813446045, |
| "learning_rate": 0.0001434357161479198, |
| "loss": 0.74, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.42437923250564336, |
| "grad_norm": 0.19514194130897522, |
| "learning_rate": 0.0001424866096020927, |
| "loss": 0.7761, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.42738901429646353, |
| "grad_norm": 0.2227347493171692, |
| "learning_rate": 0.00014153280956522322, |
| "loss": 0.7895, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.43039879608728365, |
| "grad_norm": 0.21557843685150146, |
| "learning_rate": 0.00014057442140349543, |
| "loss": 0.794, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.43340857787810383, |
| "grad_norm": 0.1980540156364441, |
| "learning_rate": 0.00013961155098994309, |
| "loss": 0.7471, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.436418359668924, |
| "grad_norm": 0.2989167869091034, |
| "learning_rate": 0.00013864430469275377, |
| "loss": 0.745, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.4394281414597442, |
| "grad_norm": 0.2816371023654938, |
| "learning_rate": 0.00013767278936351854, |
| "loss": 0.7392, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.44243792325056436, |
| "grad_norm": 0.20301824808120728, |
| "learning_rate": 0.00013669711232542776, |
| "loss": 0.7486, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.4454477050413845, |
| "grad_norm": 0.18164554238319397, |
| "learning_rate": 0.00013571738136141555, |
| "loss": 0.7571, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.44845748683220465, |
| "grad_norm": 0.20167161524295807, |
| "learning_rate": 0.0001347337047022526, |
| "loss": 0.76, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.45146726862302483, |
| "grad_norm": 0.18913504481315613, |
| "learning_rate": 0.00013374619101459012, |
| "loss": 0.7444, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.454477050413845, |
| "grad_norm": 0.2532965838909149, |
| "learning_rate": 0.00013275494938895556, |
| "loss": 0.7755, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.4574868322046652, |
| "grad_norm": 0.16350123286247253, |
| "learning_rate": 0.00013176008932770113, |
| "loss": 0.755, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.4604966139954853, |
| "grad_norm": 0.18730787932872772, |
| "learning_rate": 0.00013076172073290724, |
| "loss": 0.7369, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.4635063957863055, |
| "grad_norm": 0.21689856052398682, |
| "learning_rate": 0.00012975995389424166, |
| "loss": 0.7773, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.46651617757712566, |
| "grad_norm": 0.21407072246074677, |
| "learning_rate": 0.0001287548994767758, |
| "loss": 0.7287, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.46952595936794583, |
| "grad_norm": 0.2083187699317932, |
| "learning_rate": 0.00012774666850875942, |
| "loss": 0.7717, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.472535741158766, |
| "grad_norm": 0.2223493456840515, |
| "learning_rate": 0.00012673537236935556, |
| "loss": 0.7613, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.47554552294958613, |
| "grad_norm": 0.2172088325023651, |
| "learning_rate": 0.00012572112277633649, |
| "loss": 0.7602, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.4785553047404063, |
| "grad_norm": 0.22197513282299042, |
| "learning_rate": 0.0001247040317737419, |
| "loss": 0.7241, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.4815650865312265, |
| "grad_norm": 0.1837187558412552, |
| "learning_rate": 0.00012368421171950192, |
| "loss": 0.7313, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.48457486832204666, |
| "grad_norm": 0.23295271396636963, |
| "learning_rate": 0.00012266177527302472, |
| "loss": 0.7432, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.48758465011286684, |
| "grad_norm": 0.21737614274024963, |
| "learning_rate": 0.0001216368353827508, |
| "loss": 0.7599, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.49059443190368696, |
| "grad_norm": 0.1803632527589798, |
| "learning_rate": 0.00012060950527367603, |
| "loss": 0.7386, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.49360421369450713, |
| "grad_norm": 0.1795688271522522, |
| "learning_rate": 0.00011957989843484345, |
| "loss": 0.7548, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.4966139954853273, |
| "grad_norm": 0.2615033984184265, |
| "learning_rate": 0.00011854812860680613, |
| "loss": 0.7838, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.4996237772761475, |
| "grad_norm": 0.22325466573238373, |
| "learning_rate": 0.00011751430976906233, |
| "loss": 0.7492, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.5026335590669676, |
| "grad_norm": 0.20187042653560638, |
| "learning_rate": 0.00011647855612746423, |
| "loss": 0.7897, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.5056433408577878, |
| "grad_norm": 0.3646783232688904, |
| "learning_rate": 0.00011544098210160152, |
| "loss": 0.7847, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.508653122648608, |
| "grad_norm": 0.23362590372562408, |
| "learning_rate": 0.00011440170231216154, |
| "loss": 0.7624, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.5116629044394282, |
| "grad_norm": 0.2684471011161804, |
| "learning_rate": 0.00011336083156826722, |
| "loss": 0.7973, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.5146726862302483, |
| "grad_norm": 0.15820720791816711, |
| "learning_rate": 0.00011231848485479395, |
| "loss": 0.7322, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.5176824680210684, |
| "grad_norm": 0.3005324900150299, |
| "learning_rate": 0.00011127477731966735, |
| "loss": 0.7449, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.5206922498118887, |
| "grad_norm": 0.19756928086280823, |
| "learning_rate": 0.00011022982426114292, |
| "loss": 0.7988, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.5237020316027088, |
| "grad_norm": 0.20102205872535706, |
| "learning_rate": 0.00010918374111506893, |
| "loss": 0.7273, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.526711813393529, |
| "grad_norm": 0.2193961888551712, |
| "learning_rate": 0.00010813664344213427, |
| "loss": 0.7367, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.5297215951843491, |
| "grad_norm": 0.2881150245666504, |
| "learning_rate": 0.00010708864691510254, |
| "loss": 0.7702, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.5327313769751693, |
| "grad_norm": 0.24800467491149902, |
| "learning_rate": 0.00010603986730603368, |
| "loss": 0.7853, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.5357411587659895, |
| "grad_norm": 0.18105538189411163, |
| "learning_rate": 0.00010499042047349455, |
| "loss": 0.7576, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.5387509405568096, |
| "grad_norm": 0.16151605546474457, |
| "learning_rate": 0.00010394042234976016, |
| "loss": 0.7363, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.5417607223476298, |
| "grad_norm": 0.2590145170688629, |
| "learning_rate": 0.00010288998892800657, |
| "loss": 0.7501, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.54477050413845, |
| "grad_norm": 0.22977079451084137, |
| "learning_rate": 0.0001018392362494972, |
| "loss": 0.7768, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.5477802859292701, |
| "grad_norm": 0.23535679280757904, |
| "learning_rate": 0.00010078828039076367, |
| "loss": 0.7803, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.5507900677200903, |
| "grad_norm": 0.43004941940307617, |
| "learning_rate": 9.973723745078296e-05, |
| "loss": 0.7686, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.5537998495109104, |
| "grad_norm": 0.25901973247528076, |
| "learning_rate": 9.868622353815188e-05, |
| "loss": 0.7623, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.5568096313017307, |
| "grad_norm": 0.2264987975358963, |
| "learning_rate": 9.763535475826054e-05, |
| "loss": 0.7439, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.5598194130925508, |
| "grad_norm": 0.24374960362911224, |
| "learning_rate": 9.658474720046637e-05, |
| "loss": 0.7825, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.5628291948833709, |
| "grad_norm": 0.20084655284881592, |
| "learning_rate": 9.553451692526954e-05, |
| "loss": 0.7802, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.5658389766741911, |
| "grad_norm": 0.1831783950328827, |
| "learning_rate": 9.448477995149182e-05, |
| "loss": 0.7328, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.5688487584650113, |
| "grad_norm": 0.18079884350299835, |
| "learning_rate": 9.343565224346013e-05, |
| "loss": 0.7464, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.5718585402558315, |
| "grad_norm": 0.248003751039505, |
| "learning_rate": 9.238724969819579e-05, |
| "loss": 0.7387, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.5748683220466516, |
| "grad_norm": 0.18932189047336578, |
| "learning_rate": 9.13396881326115e-05, |
| "loss": 0.7136, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.5778781038374717, |
| "grad_norm": 0.20788271725177765, |
| "learning_rate": 9.029308327071702e-05, |
| "loss": 0.7702, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.580887885628292, |
| "grad_norm": 0.19441328942775726, |
| "learning_rate": 8.924755073083517e-05, |
| "loss": 0.7901, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.5838976674191121, |
| "grad_norm": 0.21718983352184296, |
| "learning_rate": 8.820320601282949e-05, |
| "loss": 0.7771, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.5869074492099323, |
| "grad_norm": 0.19140474498271942, |
| "learning_rate": 8.71601644853449e-05, |
| "loss": 0.7601, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.5899172310007524, |
| "grad_norm": 0.26861098408699036, |
| "learning_rate": 8.61185413730631e-05, |
| "loss": 0.7575, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.5929270127915726, |
| "grad_norm": 0.19967325031757355, |
| "learning_rate": 8.507845174397357e-05, |
| "loss": 0.8136, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.5959367945823928, |
| "grad_norm": 0.19900915026664734, |
| "learning_rate": 8.404001049666211e-05, |
| "loss": 0.7205, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.5989465763732129, |
| "grad_norm": 0.25308361649513245, |
| "learning_rate": 8.300333234761787e-05, |
| "loss": 0.768, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.6019563581640331, |
| "grad_norm": 0.16838368773460388, |
| "learning_rate": 8.196853181856081e-05, |
| "loss": 0.7502, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.6019563581640331, |
| "eval_loss": 0.8055371046066284, |
| "eval_runtime": 143.0386, |
| "eval_samples_per_second": 39.122, |
| "eval_steps_per_second": 4.894, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.6049661399548533, |
| "grad_norm": 0.22218653559684753, |
| "learning_rate": 8.093572322379045e-05, |
| "loss": 0.7597, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.6079759217456734, |
| "grad_norm": 0.19360238313674927, |
| "learning_rate": 7.990502065755748e-05, |
| "loss": 0.7592, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.6109857035364936, |
| "grad_norm": 0.25584983825683594, |
| "learning_rate": 7.887653798145987e-05, |
| "loss": 0.7432, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.6139954853273137, |
| "grad_norm": 0.23388119041919708, |
| "learning_rate": 7.785038881186462e-05, |
| "loss": 0.7638, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.617005267118134, |
| "grad_norm": 0.24461683630943298, |
| "learning_rate": 7.682668650735645e-05, |
| "loss": 0.7451, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.6200150489089541, |
| "grad_norm": 0.20549526810646057, |
| "learning_rate": 7.580554415621522e-05, |
| "loss": 0.7432, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.6230248306997742, |
| "grad_norm": 0.22572945058345795, |
| "learning_rate": 7.478707456392302e-05, |
| "loss": 0.7924, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.6260346124905944, |
| "grad_norm": 0.33990249037742615, |
| "learning_rate": 7.377139024070254e-05, |
| "loss": 0.7471, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.6290443942814146, |
| "grad_norm": 0.2138250321149826, |
| "learning_rate": 7.275860338908815e-05, |
| "loss": 0.7535, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.6320541760722348, |
| "grad_norm": 0.17337530851364136, |
| "learning_rate": 7.174882589153076e-05, |
| "loss": 0.7523, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.6350639578630549, |
| "grad_norm": 0.2286282777786255, |
| "learning_rate": 7.07421692980384e-05, |
| "loss": 0.7507, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.6380737396538751, |
| "grad_norm": 0.19162622094154358, |
| "learning_rate": 6.973874481385312e-05, |
| "loss": 0.7671, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.6410835214446953, |
| "grad_norm": 0.22725722193717957, |
| "learning_rate": 6.873866328716614e-05, |
| "loss": 0.7595, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.6440933032355154, |
| "grad_norm": 0.28031495213508606, |
| "learning_rate": 6.774203519687265e-05, |
| "loss": 0.7384, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.6471030850263356, |
| "grad_norm": 0.31639155745506287, |
| "learning_rate": 6.674897064036706e-05, |
| "loss": 0.7629, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.6501128668171557, |
| "grad_norm": 0.2253294438123703, |
| "learning_rate": 6.575957932138057e-05, |
| "loss": 0.7793, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.653122648607976, |
| "grad_norm": 0.2402772158384323, |
| "learning_rate": 6.47739705378623e-05, |
| "loss": 0.7586, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.6561324303987961, |
| "grad_norm": 0.1558917760848999, |
| "learning_rate": 6.379225316990505e-05, |
| "loss": 0.7564, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.6591422121896162, |
| "grad_norm": 0.17613032460212708, |
| "learning_rate": 6.281453566771735e-05, |
| "loss": 0.7354, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.6621519939804364, |
| "grad_norm": 0.28283971548080444, |
| "learning_rate": 6.184092603964308e-05, |
| "loss": 0.8203, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.6651617757712566, |
| "grad_norm": 0.2295006364583969, |
| "learning_rate": 6.087153184022969e-05, |
| "loss": 0.7466, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.6681715575620768, |
| "grad_norm": 0.24166414141654968, |
| "learning_rate": 5.990646015834668e-05, |
| "loss": 0.76, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.6711813393528969, |
| "grad_norm": 0.5943595170974731, |
| "learning_rate": 5.894581760535549e-05, |
| "loss": 0.7662, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.674191121143717, |
| "grad_norm": 0.23785801231861115, |
| "learning_rate": 5.798971030333227e-05, |
| "loss": 0.7742, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.6772009029345373, |
| "grad_norm": 0.20206815004348755, |
| "learning_rate": 5.703824387334442e-05, |
| "loss": 0.7837, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.6802106847253574, |
| "grad_norm": 0.21082885563373566, |
| "learning_rate": 5.609152342378278e-05, |
| "loss": 0.758, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.6832204665161776, |
| "grad_norm": 0.1953180432319641, |
| "learning_rate": 5.514965353875019e-05, |
| "loss": 0.726, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.6862302483069977, |
| "grad_norm": 0.22035697102546692, |
| "learning_rate": 5.4212738266508245e-05, |
| "loss": 0.7199, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.6892400300978179, |
| "grad_norm": 0.17208510637283325, |
| "learning_rate": 5.3280881107982946e-05, |
| "loss": 0.7379, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.6922498118886381, |
| "grad_norm": 0.18434813618659973, |
| "learning_rate": 5.235418500533109e-05, |
| "loss": 0.7574, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.6952595936794582, |
| "grad_norm": 0.35970667004585266, |
| "learning_rate": 5.143275233056817e-05, |
| "loss": 0.7737, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.6982693754702785, |
| "grad_norm": 0.19193576276302338, |
| "learning_rate": 5.051668487425938e-05, |
| "loss": 0.7268, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.7012791572610986, |
| "grad_norm": 0.21549300849437714, |
| "learning_rate": 4.960608383427481e-05, |
| "loss": 0.7542, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.7042889390519187, |
| "grad_norm": 0.1781996786594391, |
| "learning_rate": 4.8701049804610265e-05, |
| "loss": 0.7493, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.7072987208427389, |
| "grad_norm": 0.17246761918067932, |
| "learning_rate": 4.780168276427441e-05, |
| "loss": 0.7468, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.710308502633559, |
| "grad_norm": 0.22511474788188934, |
| "learning_rate": 4.6908082066244275e-05, |
| "loss": 0.7667, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.7133182844243793, |
| "grad_norm": 0.22506214678287506, |
| "learning_rate": 4.602034642648968e-05, |
| "loss": 0.788, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.7163280662151994, |
| "grad_norm": 0.23567315936088562, |
| "learning_rate": 4.513857391306812e-05, |
| "loss": 0.772, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.7193378480060195, |
| "grad_norm": 0.21594233810901642, |
| "learning_rate": 4.4262861935291144e-05, |
| "loss": 0.7257, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.7223476297968398, |
| "grad_norm": 0.2095227688550949, |
| "learning_rate": 4.339330723296373e-05, |
| "loss": 0.7456, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.7253574115876599, |
| "grad_norm": 0.2402193695306778, |
| "learning_rate": 4.25300058656972e-05, |
| "loss": 0.7722, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.7283671933784801, |
| "grad_norm": 0.2426275908946991, |
| "learning_rate": 4.1673053202297676e-05, |
| "loss": 0.7871, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.7313769751693002, |
| "grad_norm": 0.1813143640756607, |
| "learning_rate": 4.0822543910230674e-05, |
| "loss": 0.7394, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.7343867569601203, |
| "grad_norm": 0.29781273007392883, |
| "learning_rate": 3.997857194516319e-05, |
| "loss": 0.7318, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.7373965387509406, |
| "grad_norm": 0.2380327582359314, |
| "learning_rate": 3.914123054058446e-05, |
| "loss": 0.7711, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.7404063205417607, |
| "grad_norm": 0.2532583773136139, |
| "learning_rate": 3.831061219750636e-05, |
| "loss": 0.7341, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.7434161023325809, |
| "grad_norm": 0.18625672161579132, |
| "learning_rate": 3.7486808674245047e-05, |
| "loss": 0.7349, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.746425884123401, |
| "grad_norm": 0.2749151289463043, |
| "learning_rate": 3.666991097628416e-05, |
| "loss": 0.7551, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.7494356659142212, |
| "grad_norm": 0.28592735528945923, |
| "learning_rate": 3.586000934622166e-05, |
| "loss": 0.7485, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.7524454477050414, |
| "grad_norm": 0.22789210081100464, |
| "learning_rate": 3.5057193253800624e-05, |
| "loss": 0.7308, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.7554552294958615, |
| "grad_norm": 0.2588096559047699, |
| "learning_rate": 3.426155138602558e-05, |
| "loss": 0.7717, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.7584650112866818, |
| "grad_norm": 0.2197035402059555, |
| "learning_rate": 3.347317163736524e-05, |
| "loss": 0.753, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.7614747930775019, |
| "grad_norm": 0.19545480608940125, |
| "learning_rate": 3.269214110004293e-05, |
| "loss": 0.7552, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.764484574868322, |
| "grad_norm": 0.1901889145374298, |
| "learning_rate": 3.191854605441527e-05, |
| "loss": 0.7146, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.7674943566591422, |
| "grad_norm": 0.19916728138923645, |
| "learning_rate": 3.115247195944102e-05, |
| "loss": 0.7733, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.7705041384499624, |
| "grad_norm": 0.2248535454273224, |
| "learning_rate": 3.039400344324035e-05, |
| "loss": 0.7948, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.7735139202407826, |
| "grad_norm": 0.238205224275589, |
| "learning_rate": 2.9643224293745954e-05, |
| "loss": 0.7637, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.7765237020316027, |
| "grad_norm": 0.26717284321784973, |
| "learning_rate": 2.8900217449447074e-05, |
| "loss": 0.7326, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.7795334838224228, |
| "grad_norm": 0.2582661509513855, |
| "learning_rate": 2.8165064990227252e-05, |
| "loss": 0.7571, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.782543265613243, |
| "grad_norm": 0.22693775594234467, |
| "learning_rate": 2.7437848128296982e-05, |
| "loss": 0.75, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.7855530474040632, |
| "grad_norm": 0.21676841378211975, |
| "learning_rate": 2.6718647199222214e-05, |
| "loss": 0.7693, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.7885628291948834, |
| "grad_norm": 0.21357332170009613, |
| "learning_rate": 2.600754165304966e-05, |
| "loss": 0.7499, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.7915726109857035, |
| "grad_norm": 0.2448240965604782, |
| "learning_rate": 2.530461004553001e-05, |
| "loss": 0.7565, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.7945823927765236, |
| "grad_norm": 0.16155965626239777, |
| "learning_rate": 2.460993002943983e-05, |
| "loss": 0.738, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.7975921745673439, |
| "grad_norm": 0.22366459667682648, |
| "learning_rate": 2.3923578346003363e-05, |
| "loss": 0.746, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.800601956358164, |
| "grad_norm": 0.21000875532627106, |
| "learning_rate": 2.32456308164148e-05, |
| "loss": 0.7615, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.8036117381489842, |
| "grad_norm": 0.23698626458644867, |
| "learning_rate": 2.2576162333462402e-05, |
| "loss": 0.8101, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.8066215199398044, |
| "grad_norm": 0.17663483321666718, |
| "learning_rate": 2.191524685325512e-05, |
| "loss": 0.7551, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.8096313017306245, |
| "grad_norm": 0.193350687623024, |
| "learning_rate": 2.126295738705262e-05, |
| "loss": 0.7621, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.8126410835214447, |
| "grad_norm": 0.2506263256072998, |
| "learning_rate": 2.0619365993199747e-05, |
| "loss": 0.7789, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.8156508653122648, |
| "grad_norm": 0.23012906312942505, |
| "learning_rate": 1.9984543769166265e-05, |
| "loss": 0.7246, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.8186606471030851, |
| "grad_norm": 0.18312327563762665, |
| "learning_rate": 1.9358560843692787e-05, |
| "loss": 0.7295, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.8216704288939052, |
| "grad_norm": 0.22966724634170532, |
| "learning_rate": 1.8741486369043505e-05, |
| "loss": 0.7851, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.8246802106847254, |
| "grad_norm": 0.160865917801857, |
| "learning_rate": 1.8133388513367078e-05, |
| "loss": 0.7278, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.8276899924755455, |
| "grad_norm": 0.26794126629829407, |
| "learning_rate": 1.7534334453166068e-05, |
| "loss": 0.7247, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.8306997742663657, |
| "grad_norm": 0.23241771757602692, |
| "learning_rate": 1.6944390365875952e-05, |
| "loss": 0.7741, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.8337095560571859, |
| "grad_norm": 0.20465601980686188, |
| "learning_rate": 1.6363621422554476e-05, |
| "loss": 0.7267, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.836719337848006, |
| "grad_norm": 0.21765820682048798, |
| "learning_rate": 1.579209178068234e-05, |
| "loss": 0.7566, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.8397291196388262, |
| "grad_norm": 0.21495480835437775, |
| "learning_rate": 1.5229864577075547e-05, |
| "loss": 0.8005, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.8427389014296464, |
| "grad_norm": 0.19566848874092102, |
| "learning_rate": 1.4677001920910827e-05, |
| "loss": 0.7494, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.8457486832204665, |
| "grad_norm": 0.1817813217639923, |
| "learning_rate": 1.4133564886864381e-05, |
| "loss": 0.7289, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.8487584650112867, |
| "grad_norm": 0.2588653266429901, |
| "learning_rate": 1.3599613508364984e-05, |
| "loss": 0.7493, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.8517682468021068, |
| "grad_norm": 0.1954454928636551, |
| "learning_rate": 1.307520677096209e-05, |
| "loss": 0.7504, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.8547780285929271, |
| "grad_norm": 0.17466256022453308, |
| "learning_rate": 1.2560402605809707e-05, |
| "loss": 0.7705, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.8577878103837472, |
| "grad_norm": 0.2213626205921173, |
| "learning_rate": 1.2055257883266791e-05, |
| "loss": 0.7307, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.8607975921745673, |
| "grad_norm": 0.19768081605434418, |
| "learning_rate": 1.1559828406614714e-05, |
| "loss": 0.7741, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.8638073739653875, |
| "grad_norm": 0.22028213739395142, |
| "learning_rate": 1.1074168905892702e-05, |
| "loss": 0.7238, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.8668171557562077, |
| "grad_norm": 0.2675631046295166, |
| "learning_rate": 1.0598333031851881e-05, |
| "loss": 0.7315, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.8698269375470279, |
| "grad_norm": 0.20498494803905487, |
| "learning_rate": 1.0132373350028313e-05, |
| "loss": 0.751, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.872836719337848, |
| "grad_norm": 0.22774334251880646, |
| "learning_rate": 9.676341334936346e-06, |
| "loss": 0.72, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.8758465011286681, |
| "grad_norm": 0.18757307529449463, |
| "learning_rate": 9.230287364382007e-06, |
| "loss": 0.7433, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.8788562829194884, |
| "grad_norm": 0.23557019233703613, |
| "learning_rate": 8.794260713897862e-06, |
| "loss": 0.7473, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.8818660647103085, |
| "grad_norm": 0.24945016205310822, |
| "learning_rate": 8.368309551299536e-06, |
| "loss": 0.746, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.8848758465011287, |
| "grad_norm": 0.5168988704681396, |
| "learning_rate": 7.952480931364658e-06, |
| "loss": 0.7523, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.8878856282919488, |
| "grad_norm": 0.21044060587882996, |
| "learning_rate": 7.546820790634646e-06, |
| "loss": 0.7359, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.890895410082769, |
| "grad_norm": 0.2304990291595459, |
| "learning_rate": 7.1513739423402e-06, |
| "loss": 0.7201, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.8939051918735892, |
| "grad_norm": 0.5010607242584229, |
| "learning_rate": 6.766184071450721e-06, |
| "loss": 0.7619, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.8969149736644093, |
| "grad_norm": 0.1829194277524948, |
| "learning_rate": 6.391293729848435e-06, |
| "loss": 0.7654, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.8999247554552295, |
| "grad_norm": 0.4193117320537567, |
| "learning_rate": 6.026744331627731e-06, |
| "loss": 0.7416, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.9029345372460497, |
| "grad_norm": 0.2364160269498825, |
| "learning_rate": 5.672576148520137e-06, |
| "loss": 0.7516, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.9029345372460497, |
| "eval_loss": 0.7987983226776123, |
| "eval_runtime": 142.8043, |
| "eval_samples_per_second": 39.187, |
| "eval_steps_per_second": 4.902, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.9059443190368698, |
| "grad_norm": 0.1936904937028885, |
| "learning_rate": 5.328828305445477e-06, |
| "loss": 0.7357, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.90895410082769, |
| "grad_norm": 0.194386288523674, |
| "learning_rate": 4.9955387761897785e-06, |
| "loss": 0.7725, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.9119638826185101, |
| "grad_norm": 0.24116984009742737, |
| "learning_rate": 4.672744379210336e-06, |
| "loss": 0.776, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.9149736644093304, |
| "grad_norm": 0.1920221745967865, |
| "learning_rate": 4.360480773568321e-06, |
| "loss": 0.7323, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.9179834462001505, |
| "grad_norm": 0.19343051314353943, |
| "learning_rate": 4.058782454989529e-06, |
| "loss": 0.7436, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.9209932279909706, |
| "grad_norm": 0.2893059253692627, |
| "learning_rate": 3.767682752053714e-06, |
| "loss": 0.7538, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.9240030097817908, |
| "grad_norm": 0.23565465211868286, |
| "learning_rate": 3.487213822512714e-06, |
| "loss": 0.7535, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.927012791572611, |
| "grad_norm": 0.2228977531194687, |
| "learning_rate": 3.2174066497380086e-06, |
| "loss": 0.7572, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.9300225733634312, |
| "grad_norm": 0.20145723223686218, |
| "learning_rate": 2.958291039298e-06, |
| "loss": 0.7114, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.9330323551542513, |
| "grad_norm": 0.2287394404411316, |
| "learning_rate": 2.709895615665392e-06, |
| "loss": 0.7656, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.9360421369450714, |
| "grad_norm": 0.21397553384304047, |
| "learning_rate": 2.472247819055029e-06, |
| "loss": 0.7361, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.9390519187358917, |
| "grad_norm": 0.22272509336471558, |
| "learning_rate": 2.2453739023926113e-06, |
| "loss": 0.7789, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.9420617005267118, |
| "grad_norm": 0.2435009628534317, |
| "learning_rate": 2.0292989284144915e-06, |
| "loss": 0.7756, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.945071482317532, |
| "grad_norm": 0.17544378340244293, |
| "learning_rate": 1.8240467668990457e-06, |
| "loss": 0.7319, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.9480812641083521, |
| "grad_norm": 0.19456711411476135, |
| "learning_rate": 1.6296400920297383e-06, |
| "loss": 0.7469, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.9510910458991723, |
| "grad_norm": 0.21780775487422943, |
| "learning_rate": 1.4461003798903695e-06, |
| "loss": 0.7587, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.9541008276899925, |
| "grad_norm": 0.21393659710884094, |
| "learning_rate": 1.2734479060925397e-06, |
| "loss": 0.7504, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.9571106094808126, |
| "grad_norm": 0.21127097308635712, |
| "learning_rate": 1.1117017435358423e-06, |
| "loss": 0.7628, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.9601203912716328, |
| "grad_norm": 0.19377408921718597, |
| "learning_rate": 9.608797603008812e-07, |
| "loss": 0.7195, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.963130173062453, |
| "grad_norm": 0.27100813388824463, |
| "learning_rate": 8.209986176753948e-07, |
| "loss": 0.7476, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.9661399548532731, |
| "grad_norm": 0.1997697949409485, |
| "learning_rate": 6.920737683136613e-07, |
| "loss": 0.7578, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.9691497366440933, |
| "grad_norm": 0.3180257976055145, |
| "learning_rate": 5.741194545294648e-07, |
| "loss": 0.7537, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.9721595184349134, |
| "grad_norm": 0.20285405218601227, |
| "learning_rate": 4.671487067227531e-07, |
| "loss": 0.7226, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.9751693002257337, |
| "grad_norm": 0.1977706104516983, |
| "learning_rate": 3.711733419401453e-07, |
| "loss": 0.7487, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.9781790820165538, |
| "grad_norm": 0.2490547150373459, |
| "learning_rate": 2.8620396256953117e-07, |
| "loss": 0.773, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.9811888638073739, |
| "grad_norm": 0.18216556310653687, |
| "learning_rate": 2.122499551688084e-07, |
| "loss": 0.7348, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.9841986455981941, |
| "grad_norm": 0.20423290133476257, |
| "learning_rate": 1.4931948942895624e-07, |
| "loss": 0.7116, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.9872084273890143, |
| "grad_norm": 0.3691667914390564, |
| "learning_rate": 9.741951727152421e-08, |
| "loss": 0.7242, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.9902182091798345, |
| "grad_norm": 0.1876600682735443, |
| "learning_rate": 5.655577208069085e-08, |
| "loss": 0.755, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.9932279909706546, |
| "grad_norm": 0.23210102319717407, |
| "learning_rate": 2.6732768069825943e-08, |
| "loss": 0.7697, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.9962377727614747, |
| "grad_norm": 0.18966606259346008, |
| "learning_rate": 7.953799782889349e-09, |
| "loss": 0.7166, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.999247554552295, |
| "grad_norm": 0.5191490650177002, |
| "learning_rate": 2.2094173039999277e-10, |
| "loss": 0.7366, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.999849510910459, |
| "step": 3322, |
| "total_flos": 3.156428776263385e+18, |
| "train_loss": 0.7673604141007986, |
| "train_runtime": 8289.1597, |
| "train_samples_per_second": 12.826, |
| "train_steps_per_second": 0.401 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 3322, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.156428776263385e+18, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |