{ "best_metric": null, "best_model_checkpoint": null, "epoch": 27.069486404833835, "eval_steps": 100, "global_step": 4200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06445115810674723, "grad_norm": 8.751441955566406, "learning_rate": 5.154639175257732e-07, "loss": 8.7908, "step": 10 }, { "epoch": 0.12890231621349446, "grad_norm": 5.980831146240234, "learning_rate": 1.0309278350515464e-06, "loss": 8.4297, "step": 20 }, { "epoch": 0.1933534743202417, "grad_norm": 3.555551528930664, "learning_rate": 1.5463917525773197e-06, "loss": 7.9847, "step": 30 }, { "epoch": 0.2578046324269889, "grad_norm": 2.1481900215148926, "learning_rate": 2.061855670103093e-06, "loss": 7.5452, "step": 40 }, { "epoch": 0.32225579053373615, "grad_norm": 1.3654990196228027, "learning_rate": 2.577319587628866e-06, "loss": 7.1644, "step": 50 }, { "epoch": 0.3867069486404834, "grad_norm": 1.294124960899353, "learning_rate": 3.0927835051546395e-06, "loss": 6.8552, "step": 60 }, { "epoch": 0.4511581067472306, "grad_norm": 1.3459229469299316, "learning_rate": 3.6082474226804126e-06, "loss": 6.5137, "step": 70 }, { "epoch": 0.5156092648539778, "grad_norm": 1.345466136932373, "learning_rate": 4.123711340206186e-06, "loss": 6.2217, "step": 80 }, { "epoch": 0.5800604229607251, "grad_norm": 1.0641965866088867, "learning_rate": 4.639175257731959e-06, "loss": 5.999, "step": 90 }, { "epoch": 0.6445115810674723, "grad_norm": 1.5514322519302368, "learning_rate": 5.154639175257732e-06, "loss": 5.8344, "step": 100 }, { "epoch": 0.6445115810674723, "eval_loss": 5.791144847869873, "eval_runtime": 20.2064, "eval_samples_per_second": 79.43, "eval_steps_per_second": 4.998, "step": 100 }, { "epoch": 0.7089627391742196, "grad_norm": 9.654533386230469, "learning_rate": 5.670103092783505e-06, "loss": 5.7513, "step": 110 }, { "epoch": 0.7734138972809668, "grad_norm": 2.767559289932251, "learning_rate": 6.185567010309279e-06, "loss": 5.6479, "step": 120 }, { "epoch": 0.837865055387714, "grad_norm": 1.2786972522735596, "learning_rate": 6.701030927835052e-06, "loss": 5.5542, "step": 130 }, { "epoch": 0.9023162134944612, "grad_norm": 1.3562631607055664, "learning_rate": 7.216494845360825e-06, "loss": 5.4406, "step": 140 }, { "epoch": 0.9667673716012085, "grad_norm": 3.51896333694458, "learning_rate": 7.731958762886599e-06, "loss": 5.323, "step": 150 }, { "epoch": 1.0312185297079557, "grad_norm": 1.808701753616333, "learning_rate": 8.247422680412371e-06, "loss": 5.6701, "step": 160 }, { "epoch": 1.095669687814703, "grad_norm": 3.2831215858459473, "learning_rate": 8.762886597938146e-06, "loss": 5.1268, "step": 170 }, { "epoch": 1.1601208459214503, "grad_norm": 2.606133460998535, "learning_rate": 9.278350515463918e-06, "loss": 5.0758, "step": 180 }, { "epoch": 1.2245720040281973, "grad_norm": 1.7396525144577026, "learning_rate": 9.793814432989691e-06, "loss": 5.0134, "step": 190 }, { "epoch": 1.2890231621349446, "grad_norm": 0.7748392820358276, "learning_rate": 1.0309278350515464e-05, "loss": 4.9703, "step": 200 }, { "epoch": 1.2890231621349446, "eval_loss": 4.946072578430176, "eval_runtime": 19.5278, "eval_samples_per_second": 82.19, "eval_steps_per_second": 5.172, "step": 200 }, { "epoch": 1.353474320241692, "grad_norm": 2.377145290374756, "learning_rate": 1.0824742268041238e-05, "loss": 4.874, "step": 210 }, { "epoch": 1.417925478348439, "grad_norm": 1.2470804452896118, "learning_rate": 1.134020618556701e-05, "loss": 4.8473, "step": 220 }, { "epoch": 1.4823766364551862, "grad_norm": 2.26425838470459, "learning_rate": 1.1855670103092785e-05, "loss": 4.7739, "step": 230 }, { "epoch": 1.5468277945619335, "grad_norm": 1.238208532333374, "learning_rate": 1.2371134020618558e-05, "loss": 4.7524, "step": 240 }, { "epoch": 1.6112789526686808, "grad_norm": 0.9819815158843994, "learning_rate": 1.2886597938144332e-05, "loss": 4.6977, "step": 250 }, { "epoch": 1.675730110775428, "grad_norm": 1.37124502658844, "learning_rate": 1.3402061855670103e-05, "loss": 4.5812, "step": 260 }, { "epoch": 1.7401812688821754, "grad_norm": 1.0728330612182617, "learning_rate": 1.3917525773195878e-05, "loss": 4.5445, "step": 270 }, { "epoch": 1.8046324269889225, "grad_norm": 1.4190095663070679, "learning_rate": 1.443298969072165e-05, "loss": 4.4797, "step": 280 }, { "epoch": 1.8690835850956697, "grad_norm": 1.3823643922805786, "learning_rate": 1.4948453608247425e-05, "loss": 4.3833, "step": 290 }, { "epoch": 1.9335347432024168, "grad_norm": 1.6940747499465942, "learning_rate": 1.5463917525773197e-05, "loss": 4.2822, "step": 300 }, { "epoch": 1.9335347432024168, "eval_loss": 4.213192462921143, "eval_runtime": 19.7585, "eval_samples_per_second": 81.231, "eval_steps_per_second": 5.112, "step": 300 }, { "epoch": 1.997985901309164, "grad_norm": 1.6004289388656616, "learning_rate": 1.597938144329897e-05, "loss": 4.1608, "step": 310 }, { "epoch": 2.0624370594159114, "grad_norm": 2.5717203617095947, "learning_rate": 1.6494845360824743e-05, "loss": 4.392, "step": 320 }, { "epoch": 2.1268882175226587, "grad_norm": 1.9171743392944336, "learning_rate": 1.7010309278350517e-05, "loss": 3.9273, "step": 330 }, { "epoch": 2.191339375629406, "grad_norm": 2.1511971950531006, "learning_rate": 1.752577319587629e-05, "loss": 3.8527, "step": 340 }, { "epoch": 2.2557905337361532, "grad_norm": 1.949204683303833, "learning_rate": 1.8041237113402062e-05, "loss": 3.7843, "step": 350 }, { "epoch": 2.3202416918429005, "grad_norm": 1.6854971647262573, "learning_rate": 1.8556701030927837e-05, "loss": 3.7476, "step": 360 }, { "epoch": 2.3846928499496474, "grad_norm": 1.9099489450454712, "learning_rate": 1.907216494845361e-05, "loss": 3.6868, "step": 370 }, { "epoch": 2.4491440080563947, "grad_norm": 1.9068998098373413, "learning_rate": 1.9587628865979382e-05, "loss": 3.6684, "step": 380 }, { "epoch": 2.513595166163142, "grad_norm": 1.3064167499542236, "learning_rate": 1.9999996358015542e-05, "loss": 3.6028, "step": 390 }, { "epoch": 2.5780463242698892, "grad_norm": 1.7267900705337524, "learning_rate": 1.9999868888837957e-05, "loss": 3.5599, "step": 400 }, { "epoch": 2.5780463242698892, "eval_loss": 3.5509181022644043, "eval_runtime": 20.1957, "eval_samples_per_second": 79.472, "eval_steps_per_second": 5.001, "step": 400 }, { "epoch": 2.6424974823766365, "grad_norm": 2.2463932037353516, "learning_rate": 1.9999559323090132e-05, "loss": 3.5059, "step": 410 }, { "epoch": 2.706948640483384, "grad_norm": 2.046381950378418, "learning_rate": 1.9999067666409225e-05, "loss": 3.4585, "step": 420 }, { "epoch": 2.771399798590131, "grad_norm": 1.8293402194976807, "learning_rate": 1.9998393927748257e-05, "loss": 3.4393, "step": 430 }, { "epoch": 2.835850956696878, "grad_norm": 1.701651930809021, "learning_rate": 1.9997538119375938e-05, "loss": 3.3986, "step": 440 }, { "epoch": 2.900302114803625, "grad_norm": 1.8549060821533203, "learning_rate": 1.9996500256876447e-05, "loss": 3.3568, "step": 450 }, { "epoch": 2.9647532729103725, "grad_norm": 2.38415789604187, "learning_rate": 1.999528035914915e-05, "loss": 3.3115, "step": 460 }, { "epoch": 3.02920443101712, "grad_norm": 3.2878332138061523, "learning_rate": 1.9993878448408263e-05, "loss": 3.5941, "step": 470 }, { "epoch": 3.093655589123867, "grad_norm": 2.5062038898468018, "learning_rate": 1.999229455018243e-05, "loss": 3.2588, "step": 480 }, { "epoch": 3.1581067472306144, "grad_norm": 2.2001876831054688, "learning_rate": 1.9990528693314273e-05, "loss": 3.2381, "step": 490 }, { "epoch": 3.2225579053373616, "grad_norm": 2.3333001136779785, "learning_rate": 1.9988580909959864e-05, "loss": 3.2271, "step": 500 }, { "epoch": 3.2225579053373616, "eval_loss": 3.236539840698242, "eval_runtime": 19.5719, "eval_samples_per_second": 82.005, "eval_steps_per_second": 5.16, "step": 500 }, { "epoch": 3.287009063444109, "grad_norm": 2.109981060028076, "learning_rate": 1.9986451235588135e-05, "loss": 3.2038, "step": 510 }, { "epoch": 3.351460221550856, "grad_norm": 2.376877546310425, "learning_rate": 1.9984139708980228e-05, "loss": 3.1678, "step": 520 }, { "epoch": 3.415911379657603, "grad_norm": 2.0894505977630615, "learning_rate": 1.9981646372228813e-05, "loss": 3.1701, "step": 530 }, { "epoch": 3.4803625377643503, "grad_norm": 1.9505690336227417, "learning_rate": 1.997897127073728e-05, "loss": 3.1902, "step": 540 }, { "epoch": 3.5448136958710976, "grad_norm": 3.200566530227661, "learning_rate": 1.997611445321896e-05, "loss": 3.1286, "step": 550 }, { "epoch": 3.609264853977845, "grad_norm": 2.295381784439087, "learning_rate": 1.9973075971696195e-05, "loss": 3.1308, "step": 560 }, { "epoch": 3.673716012084592, "grad_norm": 2.323788642883301, "learning_rate": 1.9969855881499413e-05, "loss": 3.1093, "step": 570 }, { "epoch": 3.7381671701913395, "grad_norm": 1.9054023027420044, "learning_rate": 1.996645424126613e-05, "loss": 3.1029, "step": 580 }, { "epoch": 3.8026183282980868, "grad_norm": 2.854268789291382, "learning_rate": 1.996287111293986e-05, "loss": 3.0843, "step": 590 }, { "epoch": 3.8670694864048336, "grad_norm": 2.066882848739624, "learning_rate": 1.9959106561768988e-05, "loss": 3.0301, "step": 600 }, { "epoch": 3.8670694864048336, "eval_loss": 3.056886672973633, "eval_runtime": 20.2056, "eval_samples_per_second": 79.433, "eval_steps_per_second": 4.999, "step": 600 }, { "epoch": 3.931520644511581, "grad_norm": 2.220766544342041, "learning_rate": 1.9955160656305606e-05, "loss": 3.02, "step": 610 }, { "epoch": 3.995971802618328, "grad_norm": 2.461122751235962, "learning_rate": 1.995103346840424e-05, "loss": 3.0121, "step": 620 }, { "epoch": 4.0604229607250755, "grad_norm": 1.9937182664871216, "learning_rate": 1.9946725073220542e-05, "loss": 3.249, "step": 630 }, { "epoch": 4.124874118831823, "grad_norm": 2.3354651927948, "learning_rate": 1.9942235549209955e-05, "loss": 2.9879, "step": 640 }, { "epoch": 4.18932527693857, "grad_norm": 2.059208393096924, "learning_rate": 1.9937564978126233e-05, "loss": 2.987, "step": 650 }, { "epoch": 4.253776435045317, "grad_norm": 2.804398775100708, "learning_rate": 1.9932713445019993e-05, "loss": 2.9377, "step": 660 }, { "epoch": 4.318227593152065, "grad_norm": 2.1567623615264893, "learning_rate": 1.992768103823714e-05, "loss": 2.9478, "step": 670 }, { "epoch": 4.382678751258812, "grad_norm": 2.021939992904663, "learning_rate": 1.9922467849417288e-05, "loss": 2.9119, "step": 680 }, { "epoch": 4.447129909365559, "grad_norm": 1.5279889106750488, "learning_rate": 1.9917073973492055e-05, "loss": 2.9033, "step": 690 }, { "epoch": 4.5115810674723065, "grad_norm": 1.7887712717056274, "learning_rate": 1.991149950868336e-05, "loss": 2.8944, "step": 700 }, { "epoch": 4.5115810674723065, "eval_loss": 2.8965914249420166, "eval_runtime": 21.0307, "eval_samples_per_second": 76.317, "eval_steps_per_second": 4.802, "step": 700 }, { "epoch": 4.576032225579054, "grad_norm": 1.7073420286178589, "learning_rate": 1.9905744556501627e-05, "loss": 2.8471, "step": 710 }, { "epoch": 4.640483383685801, "grad_norm": 1.862641453742981, "learning_rate": 1.989980922174394e-05, "loss": 2.8432, "step": 720 }, { "epoch": 4.704934541792548, "grad_norm": 1.9594634771347046, "learning_rate": 1.9893693612492116e-05, "loss": 2.8482, "step": 730 }, { "epoch": 4.769385699899295, "grad_norm": 1.6772149801254272, "learning_rate": 1.988739784011077e-05, "loss": 2.8538, "step": 740 }, { "epoch": 4.833836858006042, "grad_norm": 1.9590495824813843, "learning_rate": 1.9880922019245258e-05, "loss": 2.8703, "step": 750 }, { "epoch": 4.898288016112789, "grad_norm": 1.4671125411987305, "learning_rate": 1.9874266267819604e-05, "loss": 2.8078, "step": 760 }, { "epoch": 4.962739174219537, "grad_norm": 1.8790684938430786, "learning_rate": 1.986743070703435e-05, "loss": 2.779, "step": 770 }, { "epoch": 5.027190332326284, "grad_norm": 1.6177160739898682, "learning_rate": 1.9860415461364343e-05, "loss": 3.0088, "step": 780 }, { "epoch": 5.091641490433031, "grad_norm": 1.774277687072754, "learning_rate": 1.9853220658556474e-05, "loss": 2.7841, "step": 790 }, { "epoch": 5.1560926485397784, "grad_norm": 1.4572412967681885, "learning_rate": 1.984584642962735e-05, "loss": 2.7748, "step": 800 }, { "epoch": 5.1560926485397784, "eval_loss": 2.7890875339508057, "eval_runtime": 19.598, "eval_samples_per_second": 81.896, "eval_steps_per_second": 5.154, "step": 800 }, { "epoch": 5.220543806646526, "grad_norm": 1.4425498247146606, "learning_rate": 1.9838292908860922e-05, "loss": 2.7712, "step": 810 }, { "epoch": 5.284994964753273, "grad_norm": 1.4506257772445679, "learning_rate": 1.9830560233806006e-05, "loss": 2.758, "step": 820 }, { "epoch": 5.34944612286002, "grad_norm": 1.3948203325271606, "learning_rate": 1.982264854527381e-05, "loss": 2.7487, "step": 830 }, { "epoch": 5.413897280966768, "grad_norm": 1.170486330986023, "learning_rate": 1.9814557987335363e-05, "loss": 2.7558, "step": 840 }, { "epoch": 5.478348439073515, "grad_norm": 1.3575382232666016, "learning_rate": 1.980628870731888e-05, "loss": 2.7266, "step": 850 }, { "epoch": 5.542799597180262, "grad_norm": 1.3390381336212158, "learning_rate": 1.979784085580708e-05, "loss": 2.7156, "step": 860 }, { "epoch": 5.6072507552870094, "grad_norm": 1.3795325756072998, "learning_rate": 1.978921458663447e-05, "loss": 2.7069, "step": 870 }, { "epoch": 5.671701913393756, "grad_norm": 1.2567253112792969, "learning_rate": 1.9780410056884505e-05, "loss": 2.7121, "step": 880 }, { "epoch": 5.736153071500503, "grad_norm": 1.6211382150650024, "learning_rate": 1.977142742688676e-05, "loss": 2.7006, "step": 890 }, { "epoch": 5.80060422960725, "grad_norm": 1.5447043180465698, "learning_rate": 1.9762266860213982e-05, "loss": 2.6635, "step": 900 }, { "epoch": 5.80060422960725, "eval_loss": 2.704418659210205, "eval_runtime": 20.2115, "eval_samples_per_second": 79.41, "eval_steps_per_second": 4.997, "step": 900 }, { "epoch": 5.865055387713998, "grad_norm": 1.28032648563385, "learning_rate": 1.9752928523679145e-05, "loss": 2.7062, "step": 910 }, { "epoch": 5.929506545820745, "grad_norm": 1.323886752128601, "learning_rate": 1.974341258733238e-05, "loss": 2.6955, "step": 920 }, { "epoch": 5.993957703927492, "grad_norm": 1.370737910270691, "learning_rate": 1.9733719224457896e-05, "loss": 2.6607, "step": 930 }, { "epoch": 6.05840886203424, "grad_norm": 1.2304973602294922, "learning_rate": 1.972384861157082e-05, "loss": 2.8645, "step": 940 }, { "epoch": 6.122860020140987, "grad_norm": 1.251336693763733, "learning_rate": 1.9713800928413987e-05, "loss": 2.678, "step": 950 }, { "epoch": 6.187311178247734, "grad_norm": 1.4331055879592896, "learning_rate": 1.9703576357954653e-05, "loss": 2.6339, "step": 960 }, { "epoch": 6.251762336354481, "grad_norm": 1.4404268264770508, "learning_rate": 1.969317508638119e-05, "loss": 2.6497, "step": 970 }, { "epoch": 6.316213494461229, "grad_norm": 1.2648017406463623, "learning_rate": 1.9682597303099663e-05, "loss": 2.6194, "step": 980 }, { "epoch": 6.380664652567976, "grad_norm": 1.464142084121704, "learning_rate": 1.9671843200730408e-05, "loss": 2.6342, "step": 990 }, { "epoch": 6.445115810674723, "grad_norm": 1.4572694301605225, "learning_rate": 1.96609129751045e-05, "loss": 2.6102, "step": 1000 }, { "epoch": 6.445115810674723, "eval_loss": 2.6344494819641113, "eval_runtime": 19.7801, "eval_samples_per_second": 81.142, "eval_steps_per_second": 5.106, "step": 1000 }, { "epoch": 6.509566968781471, "grad_norm": 1.3998351097106934, "learning_rate": 1.9649806825260215e-05, "loss": 2.6117, "step": 1010 }, { "epoch": 6.574018126888218, "grad_norm": 1.2323154211044312, "learning_rate": 1.9638524953439385e-05, "loss": 2.5907, "step": 1020 }, { "epoch": 6.638469284994965, "grad_norm": 1.5451122522354126, "learning_rate": 1.9627067565083716e-05, "loss": 2.5988, "step": 1030 }, { "epoch": 6.702920443101712, "grad_norm": 1.4288865327835083, "learning_rate": 1.9615434868831057e-05, "loss": 2.6088, "step": 1040 }, { "epoch": 6.76737160120846, "grad_norm": 1.4163669347763062, "learning_rate": 1.9603627076511595e-05, "loss": 2.5787, "step": 1050 }, { "epoch": 6.831822759315206, "grad_norm": 1.4155839681625366, "learning_rate": 1.9591644403143997e-05, "loss": 2.5802, "step": 1060 }, { "epoch": 6.896273917421953, "grad_norm": 1.2025197744369507, "learning_rate": 1.9579487066931495e-05, "loss": 2.56, "step": 1070 }, { "epoch": 6.960725075528701, "grad_norm": 1.331957459449768, "learning_rate": 1.956715528925792e-05, "loss": 2.5797, "step": 1080 }, { "epoch": 7.025176233635448, "grad_norm": 1.543091893196106, "learning_rate": 1.955464929468365e-05, "loss": 2.7584, "step": 1090 }, { "epoch": 7.089627391742195, "grad_norm": 1.4929845333099365, "learning_rate": 1.954196931094155e-05, "loss": 2.5724, "step": 1100 }, { "epoch": 7.089627391742195, "eval_loss": 2.5758652687072754, "eval_runtime": 19.7829, "eval_samples_per_second": 81.131, "eval_steps_per_second": 5.105, "step": 1100 }, { "epoch": 7.1540785498489425, "grad_norm": 1.3239967823028564, "learning_rate": 1.9529115568932796e-05, "loss": 2.5215, "step": 1110 }, { "epoch": 7.21852970795569, "grad_norm": 1.2997671365737915, "learning_rate": 1.9516088302722696e-05, "loss": 2.5256, "step": 1120 }, { "epoch": 7.282980866062437, "grad_norm": 1.4063647985458374, "learning_rate": 1.9502887749536406e-05, "loss": 2.5286, "step": 1130 }, { "epoch": 7.347432024169184, "grad_norm": 1.625854730606079, "learning_rate": 1.9489514149754624e-05, "loss": 2.5245, "step": 1140 }, { "epoch": 7.411883182275932, "grad_norm": 1.2255531549453735, "learning_rate": 1.9475967746909212e-05, "loss": 2.5373, "step": 1150 }, { "epoch": 7.476334340382679, "grad_norm": 1.3738540410995483, "learning_rate": 1.946224878767875e-05, "loss": 2.5108, "step": 1160 }, { "epoch": 7.540785498489426, "grad_norm": 1.7884373664855957, "learning_rate": 1.9448357521884057e-05, "loss": 2.512, "step": 1170 }, { "epoch": 7.6052366565961735, "grad_norm": 1.4379594326019287, "learning_rate": 1.9434294202483634e-05, "loss": 2.5219, "step": 1180 }, { "epoch": 7.669687814702921, "grad_norm": 1.5573759078979492, "learning_rate": 1.9420059085569062e-05, "loss": 2.4937, "step": 1190 }, { "epoch": 7.734138972809667, "grad_norm": 1.5050828456878662, "learning_rate": 1.940565243036034e-05, "loss": 2.5267, "step": 1200 }, { "epoch": 7.734138972809667, "eval_loss": 2.5181825160980225, "eval_runtime": 19.5726, "eval_samples_per_second": 82.002, "eval_steps_per_second": 5.16, "step": 1200 }, { "epoch": 7.7985901309164145, "grad_norm": 1.6680591106414795, "learning_rate": 1.9391074499201155e-05, "loss": 2.5034, "step": 1210 }, { "epoch": 7.863041289023162, "grad_norm": 1.1828731298446655, "learning_rate": 1.9376325557554113e-05, "loss": 2.493, "step": 1220 }, { "epoch": 7.927492447129909, "grad_norm": 1.5516493320465088, "learning_rate": 1.9361405873995904e-05, "loss": 2.4866, "step": 1230 }, { "epoch": 7.991943605236656, "grad_norm": 1.2538108825683594, "learning_rate": 1.9346315720212416e-05, "loss": 2.4935, "step": 1240 }, { "epoch": 8.056394763343404, "grad_norm": 1.4746509790420532, "learning_rate": 1.933105537099377e-05, "loss": 2.6784, "step": 1250 }, { "epoch": 8.120845921450151, "grad_norm": 1.491493821144104, "learning_rate": 1.9315625104229336e-05, "loss": 2.484, "step": 1260 }, { "epoch": 8.185297079556898, "grad_norm": 1.5967274904251099, "learning_rate": 1.9300025200902666e-05, "loss": 2.4746, "step": 1270 }, { "epoch": 8.249748237663646, "grad_norm": 1.477860689163208, "learning_rate": 1.928425594508637e-05, "loss": 2.4447, "step": 1280 }, { "epoch": 8.314199395770393, "grad_norm": 1.4457802772521973, "learning_rate": 1.9268317623936957e-05, "loss": 2.4438, "step": 1290 }, { "epoch": 8.37865055387714, "grad_norm": 1.1860662698745728, "learning_rate": 1.9252210527689596e-05, "loss": 2.4863, "step": 1300 }, { "epoch": 8.37865055387714, "eval_loss": 2.4702823162078857, "eval_runtime": 20.2158, "eval_samples_per_second": 79.393, "eval_steps_per_second": 4.996, "step": 1300 }, { "epoch": 8.443101711983887, "grad_norm": 1.36308753490448, "learning_rate": 1.9235934949652825e-05, "loss": 2.4382, "step": 1310 }, { "epoch": 8.507552870090635, "grad_norm": 1.0049257278442383, "learning_rate": 1.9219491186203222e-05, "loss": 2.4478, "step": 1320 }, { "epoch": 8.572004028197382, "grad_norm": 1.4377230405807495, "learning_rate": 1.9202879536780013e-05, "loss": 2.4276, "step": 1330 }, { "epoch": 8.63645518630413, "grad_norm": 1.3836121559143066, "learning_rate": 1.91861003038796e-05, "loss": 2.4464, "step": 1340 }, { "epoch": 8.700906344410877, "grad_norm": 1.3200100660324097, "learning_rate": 1.9169153793050065e-05, "loss": 2.438, "step": 1350 }, { "epoch": 8.765357502517624, "grad_norm": 1.4599627256393433, "learning_rate": 1.9152040312885604e-05, "loss": 2.4192, "step": 1360 }, { "epoch": 8.829808660624371, "grad_norm": 1.2848976850509644, "learning_rate": 1.9134760175020906e-05, "loss": 2.4119, "step": 1370 }, { "epoch": 8.894259818731118, "grad_norm": 1.195146083831787, "learning_rate": 1.9117313694125482e-05, "loss": 2.4047, "step": 1380 }, { "epoch": 8.958710976837866, "grad_norm": 1.5599223375320435, "learning_rate": 1.9099701187897927e-05, "loss": 2.4238, "step": 1390 }, { "epoch": 9.023162134944613, "grad_norm": 1.7016339302062988, "learning_rate": 1.9081922977060146e-05, "loss": 2.5924, "step": 1400 }, { "epoch": 9.023162134944613, "eval_loss": 2.425938844680786, "eval_runtime": 20.1925, "eval_samples_per_second": 79.485, "eval_steps_per_second": 5.002, "step": 1400 }, { "epoch": 9.08761329305136, "grad_norm": 1.5949656963348389, "learning_rate": 1.9063979385351512e-05, "loss": 2.4341, "step": 1410 }, { "epoch": 9.152064451158108, "grad_norm": 1.4340884685516357, "learning_rate": 1.9045870739522953e-05, "loss": 2.4071, "step": 1420 }, { "epoch": 9.216515609264855, "grad_norm": 1.2564432621002197, "learning_rate": 1.902759736933102e-05, "loss": 2.3866, "step": 1430 }, { "epoch": 9.280966767371602, "grad_norm": 1.338267207145691, "learning_rate": 1.9009159607531886e-05, "loss": 2.3687, "step": 1440 }, { "epoch": 9.345417925478348, "grad_norm": 1.4795355796813965, "learning_rate": 1.8990557789875265e-05, "loss": 2.3932, "step": 1450 }, { "epoch": 9.409869083585095, "grad_norm": 1.3495395183563232, "learning_rate": 1.8971792255098326e-05, "loss": 2.3738, "step": 1460 }, { "epoch": 9.474320241691842, "grad_norm": 1.5323916673660278, "learning_rate": 1.8952863344919495e-05, "loss": 2.3773, "step": 1470 }, { "epoch": 9.53877139979859, "grad_norm": 1.2470097541809082, "learning_rate": 1.893377140403225e-05, "loss": 2.364, "step": 1480 }, { "epoch": 9.603222557905337, "grad_norm": 1.2765355110168457, "learning_rate": 1.891451678009886e-05, "loss": 2.3414, "step": 1490 }, { "epoch": 9.667673716012084, "grad_norm": 1.2045294046401978, "learning_rate": 1.8895099823744005e-05, "loss": 2.3869, "step": 1500 }, { "epoch": 9.667673716012084, "eval_loss": 2.387402057647705, "eval_runtime": 20.2004, "eval_samples_per_second": 79.454, "eval_steps_per_second": 5.0, "step": 1500 }, { "epoch": 9.732124874118831, "grad_norm": 1.488773226737976, "learning_rate": 1.887552088854844e-05, "loss": 2.3653, "step": 1510 }, { "epoch": 9.796576032225579, "grad_norm": 1.442879319190979, "learning_rate": 1.8855780331042538e-05, "loss": 2.3483, "step": 1520 }, { "epoch": 9.861027190332326, "grad_norm": 1.2660679817199707, "learning_rate": 1.8835878510699793e-05, "loss": 2.3569, "step": 1530 }, { "epoch": 9.925478348439073, "grad_norm": 1.575183391571045, "learning_rate": 1.8815815789930277e-05, "loss": 2.3385, "step": 1540 }, { "epoch": 9.98992950654582, "grad_norm": 1.270186185836792, "learning_rate": 1.8795592534074045e-05, "loss": 2.3276, "step": 1550 }, { "epoch": 10.054380664652568, "grad_norm": 1.4595534801483154, "learning_rate": 1.877520911139448e-05, "loss": 2.5222, "step": 1560 }, { "epoch": 10.118831822759315, "grad_norm": 1.2067569494247437, "learning_rate": 1.8754665893071583e-05, "loss": 2.3175, "step": 1570 }, { "epoch": 10.183282980866062, "grad_norm": 1.1168224811553955, "learning_rate": 1.8733963253195217e-05, "loss": 2.3505, "step": 1580 }, { "epoch": 10.24773413897281, "grad_norm": 1.3407734632492065, "learning_rate": 1.8713101568758295e-05, "loss": 2.3406, "step": 1590 }, { "epoch": 10.312185297079557, "grad_norm": 1.2485662698745728, "learning_rate": 1.8692081219649926e-05, "loss": 2.3084, "step": 1600 }, { "epoch": 10.312185297079557, "eval_loss": 2.3460793495178223, "eval_runtime": 19.8125, "eval_samples_per_second": 81.009, "eval_steps_per_second": 5.098, "step": 1600 }, { "epoch": 10.376636455186304, "grad_norm": 1.335404872894287, "learning_rate": 1.8670902588648467e-05, "loss": 2.3151, "step": 1610 }, { "epoch": 10.441087613293051, "grad_norm": 1.5283225774765015, "learning_rate": 1.8649566061414583e-05, "loss": 2.3153, "step": 1620 }, { "epoch": 10.505538771399799, "grad_norm": 1.2574470043182373, "learning_rate": 1.8628072026484215e-05, "loss": 2.303, "step": 1630 }, { "epoch": 10.569989929506546, "grad_norm": 1.3589491844177246, "learning_rate": 1.8606420875261492e-05, "loss": 2.3191, "step": 1640 }, { "epoch": 10.634441087613293, "grad_norm": 1.3146897554397583, "learning_rate": 1.858461300201163e-05, "loss": 2.3128, "step": 1650 }, { "epoch": 10.69889224572004, "grad_norm": 1.363336205482483, "learning_rate": 1.856264880385372e-05, "loss": 2.2795, "step": 1660 }, { "epoch": 10.763343403826788, "grad_norm": 1.402933120727539, "learning_rate": 1.8540528680753525e-05, "loss": 2.2819, "step": 1670 }, { "epoch": 10.827794561933535, "grad_norm": 1.199621558189392, "learning_rate": 1.851825303551618e-05, "loss": 2.2937, "step": 1680 }, { "epoch": 10.892245720040282, "grad_norm": 1.4294910430908203, "learning_rate": 1.8495822273778867e-05, "loss": 2.2985, "step": 1690 }, { "epoch": 10.95669687814703, "grad_norm": 1.211239218711853, "learning_rate": 1.8473236804003412e-05, "loss": 2.2803, "step": 1700 }, { "epoch": 10.95669687814703, "eval_loss": 2.2990825176239014, "eval_runtime": 19.7801, "eval_samples_per_second": 81.142, "eval_steps_per_second": 5.106, "step": 1700 }, { "epoch": 11.021148036253777, "grad_norm": 1.610541582107544, "learning_rate": 1.8450497037468876e-05, "loss": 2.4507, "step": 1710 }, { "epoch": 11.085599194360524, "grad_norm": 1.265767216682434, "learning_rate": 1.8427603388264027e-05, "loss": 2.2681, "step": 1720 }, { "epoch": 11.150050352467272, "grad_norm": 1.1170353889465332, "learning_rate": 1.8404556273279835e-05, "loss": 2.2513, "step": 1730 }, { "epoch": 11.214501510574019, "grad_norm": 1.5260467529296875, "learning_rate": 1.8381356112201863e-05, "loss": 2.2696, "step": 1740 }, { "epoch": 11.278952668680766, "grad_norm": 1.3105249404907227, "learning_rate": 1.835800332750263e-05, "loss": 2.2703, "step": 1750 }, { "epoch": 11.343403826787512, "grad_norm": 1.692631483078003, "learning_rate": 1.8334498344433903e-05, "loss": 2.2637, "step": 1760 }, { "epoch": 11.407854984894259, "grad_norm": 1.5331857204437256, "learning_rate": 1.8310841591018977e-05, "loss": 2.2499, "step": 1770 }, { "epoch": 11.472306143001006, "grad_norm": 1.3038716316223145, "learning_rate": 1.828703349804487e-05, "loss": 2.2658, "step": 1780 }, { "epoch": 11.536757301107754, "grad_norm": 1.1918790340423584, "learning_rate": 1.826307449905447e-05, "loss": 2.2517, "step": 1790 }, { "epoch": 11.6012084592145, "grad_norm": 1.4929193258285522, "learning_rate": 1.823896503033865e-05, "loss": 2.2393, "step": 1800 }, { "epoch": 11.6012084592145, "eval_loss": 2.256037712097168, "eval_runtime": 21.0369, "eval_samples_per_second": 76.294, "eval_steps_per_second": 4.801, "step": 1800 }, { "epoch": 11.665659617321248, "grad_norm": 1.3841142654418945, "learning_rate": 1.8214705530928322e-05, "loss": 2.228, "step": 1810 }, { "epoch": 11.730110775427995, "grad_norm": 1.3609843254089355, "learning_rate": 1.819029644258645e-05, "loss": 2.2211, "step": 1820 }, { "epoch": 11.794561933534743, "grad_norm": 1.7460395097732544, "learning_rate": 1.816573820979998e-05, "loss": 2.2081, "step": 1830 }, { "epoch": 11.85901309164149, "grad_norm": 1.2562885284423828, "learning_rate": 1.8141031279771777e-05, "loss": 2.2201, "step": 1840 }, { "epoch": 11.923464249748237, "grad_norm": 1.5480189323425293, "learning_rate": 1.811617610241246e-05, "loss": 2.2099, "step": 1850 }, { "epoch": 11.987915407854985, "grad_norm": 1.1817470788955688, "learning_rate": 1.8091173130332214e-05, "loss": 2.1935, "step": 1860 }, { "epoch": 12.052366565961732, "grad_norm": 1.283387541770935, "learning_rate": 1.8066022818832564e-05, "loss": 2.3754, "step": 1870 }, { "epoch": 12.11681772406848, "grad_norm": 1.4642541408538818, "learning_rate": 1.804072562589805e-05, "loss": 2.1914, "step": 1880 }, { "epoch": 12.181268882175226, "grad_norm": 1.3333503007888794, "learning_rate": 1.8015282012187927e-05, "loss": 2.1918, "step": 1890 }, { "epoch": 12.245720040281974, "grad_norm": 1.4526790380477905, "learning_rate": 1.7989692441027744e-05, "loss": 2.1748, "step": 1900 }, { "epoch": 12.245720040281974, "eval_loss": 2.212247133255005, "eval_runtime": 20.2138, "eval_samples_per_second": 79.401, "eval_steps_per_second": 4.997, "step": 1900 }, { "epoch": 12.310171198388721, "grad_norm": 1.2756415605545044, "learning_rate": 1.796395737840093e-05, "loss": 2.2262, "step": 1910 }, { "epoch": 12.374622356495468, "grad_norm": 1.2323824167251587, "learning_rate": 1.7938077292940288e-05, "loss": 2.1796, "step": 1920 }, { "epoch": 12.439073514602216, "grad_norm": 1.2132095098495483, "learning_rate": 1.7912052655919478e-05, "loss": 2.1934, "step": 1930 }, { "epoch": 12.503524672708963, "grad_norm": 1.45913827419281, "learning_rate": 1.7885883941244432e-05, "loss": 2.169, "step": 1940 }, { "epoch": 12.56797583081571, "grad_norm": 1.398886799812317, "learning_rate": 1.7859571625444712e-05, "loss": 2.1845, "step": 1950 }, { "epoch": 12.632426988922457, "grad_norm": 1.386767029762268, "learning_rate": 1.7833116187664846e-05, "loss": 2.1563, "step": 1960 }, { "epoch": 12.696878147029205, "grad_norm": 1.4831831455230713, "learning_rate": 1.7806518109655604e-05, "loss": 2.1592, "step": 1970 }, { "epoch": 12.761329305135952, "grad_norm": 1.3067753314971924, "learning_rate": 1.777977787576521e-05, "loss": 2.1587, "step": 1980 }, { "epoch": 12.8257804632427, "grad_norm": 1.4871938228607178, "learning_rate": 1.7752895972930538e-05, "loss": 2.1575, "step": 1990 }, { "epoch": 12.890231621349447, "grad_norm": 1.3559268712997437, "learning_rate": 1.772587289066823e-05, "loss": 2.1365, "step": 2000 }, { "epoch": 12.890231621349447, "eval_loss": 2.167628526687622, "eval_runtime": 19.7767, "eval_samples_per_second": 81.156, "eval_steps_per_second": 5.107, "step": 2000 }, { "epoch": 12.954682779456194, "grad_norm": 1.4033281803131104, "learning_rate": 1.769870912106581e-05, "loss": 2.1484, "step": 2010 }, { "epoch": 13.019133937562941, "grad_norm": 1.3120031356811523, "learning_rate": 1.7671405158772686e-05, "loss": 2.3176, "step": 2020 }, { "epoch": 13.083585095669688, "grad_norm": 1.3632259368896484, "learning_rate": 1.764396150099116e-05, "loss": 2.1399, "step": 2030 }, { "epoch": 13.148036253776436, "grad_norm": 1.3641666173934937, "learning_rate": 1.7616378647467387e-05, "loss": 2.1302, "step": 2040 }, { "epoch": 13.212487411883183, "grad_norm": 1.2037588357925415, "learning_rate": 1.758865710048225e-05, "loss": 2.1169, "step": 2050 }, { "epoch": 13.27693856998993, "grad_norm": 1.3655686378479004, "learning_rate": 1.7560797364842235e-05, "loss": 2.1228, "step": 2060 }, { "epoch": 13.341389728096678, "grad_norm": 1.2204645872116089, "learning_rate": 1.7532799947870224e-05, "loss": 2.1027, "step": 2070 }, { "epoch": 13.405840886203425, "grad_norm": 1.3521157503128052, "learning_rate": 1.7504665359396255e-05, "loss": 2.1038, "step": 2080 }, { "epoch": 13.47029204431017, "grad_norm": 1.3395577669143677, "learning_rate": 1.7476394111748262e-05, "loss": 2.0887, "step": 2090 }, { "epoch": 13.534743202416918, "grad_norm": 1.427236795425415, "learning_rate": 1.7447986719742708e-05, "loss": 2.1054, "step": 2100 }, { "epoch": 13.534743202416918, "eval_loss": 2.1201364994049072, "eval_runtime": 19.5957, "eval_samples_per_second": 81.906, "eval_steps_per_second": 5.154, "step": 2100 }, { "epoch": 13.599194360523665, "grad_norm": 1.7869354486465454, "learning_rate": 1.7419443700675248e-05, "loss": 2.0844, "step": 2110 }, { "epoch": 13.663645518630412, "grad_norm": 1.4408442974090576, "learning_rate": 1.7390765574311287e-05, "loss": 2.1039, "step": 2120 }, { "epoch": 13.72809667673716, "grad_norm": 1.5654326677322388, "learning_rate": 1.7361952862876505e-05, "loss": 2.0889, "step": 2130 }, { "epoch": 13.792547834843907, "grad_norm": 1.5477581024169922, "learning_rate": 1.7333006091047386e-05, "loss": 2.0645, "step": 2140 }, { "epoch": 13.856998992950654, "grad_norm": 1.3517593145370483, "learning_rate": 1.730392578594162e-05, "loss": 2.0899, "step": 2150 }, { "epoch": 13.921450151057401, "grad_norm": 1.4705018997192383, "learning_rate": 1.7274712477108538e-05, "loss": 2.0876, "step": 2160 }, { "epoch": 13.985901309164149, "grad_norm": 1.5158582925796509, "learning_rate": 1.7245366696519448e-05, "loss": 2.0757, "step": 2170 }, { "epoch": 14.050352467270896, "grad_norm": 1.5032602548599243, "learning_rate": 1.7215888978557953e-05, "loss": 2.2535, "step": 2180 }, { "epoch": 14.114803625377643, "grad_norm": 1.3072115182876587, "learning_rate": 1.7186279860010228e-05, "loss": 2.0597, "step": 2190 }, { "epoch": 14.17925478348439, "grad_norm": 1.522055745124817, "learning_rate": 1.7156539880055236e-05, "loss": 2.0326, "step": 2200 }, { "epoch": 14.17925478348439, "eval_loss": 2.0740458965301514, "eval_runtime": 19.8046, "eval_samples_per_second": 81.042, "eval_steps_per_second": 5.1, "step": 2200 }, { "epoch": 14.243705941591138, "grad_norm": 1.2756038904190063, "learning_rate": 1.7126669580254908e-05, "loss": 2.0255, "step": 2210 }, { "epoch": 14.308157099697885, "grad_norm": 1.6488431692123413, "learning_rate": 1.7096669504544293e-05, "loss": 2.0271, "step": 2220 }, { "epoch": 14.372608257804632, "grad_norm": 1.4515522718429565, "learning_rate": 1.706654019922164e-05, "loss": 2.055, "step": 2230 }, { "epoch": 14.43705941591138, "grad_norm": 1.530522108078003, "learning_rate": 1.7036282212938468e-05, "loss": 2.0461, "step": 2240 }, { "epoch": 14.501510574018127, "grad_norm": 1.4453781843185425, "learning_rate": 1.7005896096689544e-05, "loss": 2.0376, "step": 2250 }, { "epoch": 14.565961732124874, "grad_norm": 1.578723430633545, "learning_rate": 1.697538240380288e-05, "loss": 2.0439, "step": 2260 }, { "epoch": 14.630412890231622, "grad_norm": 1.3801120519638062, "learning_rate": 1.6944741689929646e-05, "loss": 2.0133, "step": 2270 }, { "epoch": 14.694864048338369, "grad_norm": 1.2451757192611694, "learning_rate": 1.6913974513034046e-05, "loss": 2.014, "step": 2280 }, { "epoch": 14.759315206445116, "grad_norm": 1.3197917938232422, "learning_rate": 1.6883081433383163e-05, "loss": 2.0191, "step": 2290 }, { "epoch": 14.823766364551863, "grad_norm": 1.5146923065185547, "learning_rate": 1.6852063013536765e-05, "loss": 1.992, "step": 2300 }, { "epoch": 14.823766364551863, "eval_loss": 2.0283043384552, "eval_runtime": 20.2251, "eval_samples_per_second": 79.357, "eval_steps_per_second": 4.994, "step": 2300 }, { "epoch": 14.88821752265861, "grad_norm": 1.3689446449279785, "learning_rate": 1.6820919818337035e-05, "loss": 1.9953, "step": 2310 }, { "epoch": 14.952668680765358, "grad_norm": 1.3700356483459473, "learning_rate": 1.6789652414898315e-05, "loss": 2.0154, "step": 2320 }, { "epoch": 15.017119838872105, "grad_norm": 1.4734395742416382, "learning_rate": 1.6758261372596768e-05, "loss": 2.1667, "step": 2330 }, { "epoch": 15.081570996978853, "grad_norm": 1.5056065320968628, "learning_rate": 1.6726747263059996e-05, "loss": 1.9786, "step": 2340 }, { "epoch": 15.1460221550856, "grad_norm": 1.51126229763031, "learning_rate": 1.6695110660156652e-05, "loss": 1.9809, "step": 2350 }, { "epoch": 15.210473313192347, "grad_norm": 1.5719977617263794, "learning_rate": 1.6663352139985977e-05, "loss": 1.9794, "step": 2360 }, { "epoch": 15.274924471299094, "grad_norm": 1.413379192352295, "learning_rate": 1.6631472280867314e-05, "loss": 1.9688, "step": 2370 }, { "epoch": 15.339375629405842, "grad_norm": 1.4571571350097656, "learning_rate": 1.6599471663329577e-05, "loss": 1.9489, "step": 2380 }, { "epoch": 15.403826787512589, "grad_norm": 1.4078450202941895, "learning_rate": 1.656735087010067e-05, "loss": 1.9597, "step": 2390 }, { "epoch": 15.468277945619334, "grad_norm": 1.5099396705627441, "learning_rate": 1.653511048609689e-05, "loss": 1.9692, "step": 2400 }, { "epoch": 15.468277945619334, "eval_loss": 1.986576795578003, "eval_runtime": 20.2391, "eval_samples_per_second": 79.302, "eval_steps_per_second": 4.99, "step": 2400 }, { "epoch": 15.532729103726084, "grad_norm": 1.317844271659851, "learning_rate": 1.6502751098412282e-05, "loss": 1.9565, "step": 2410 }, { "epoch": 15.597180261832829, "grad_norm": 1.5470753908157349, "learning_rate": 1.6470273296307907e-05, "loss": 1.9569, "step": 2420 }, { "epoch": 15.661631419939576, "grad_norm": 1.732040524482727, "learning_rate": 1.643767767120117e-05, "loss": 1.9452, "step": 2430 }, { "epoch": 15.726082578046324, "grad_norm": 1.4074490070343018, "learning_rate": 1.6404964816654993e-05, "loss": 1.9543, "step": 2440 }, { "epoch": 15.79053373615307, "grad_norm": 1.6578551530838013, "learning_rate": 1.6372135328367058e-05, "loss": 1.9372, "step": 2450 }, { "epoch": 15.854984894259818, "grad_norm": 1.514443039894104, "learning_rate": 1.6339189804158922e-05, "loss": 1.9374, "step": 2460 }, { "epoch": 15.919436052366565, "grad_norm": 1.6120061874389648, "learning_rate": 1.630612884396515e-05, "loss": 1.962, "step": 2470 }, { "epoch": 15.983887210473313, "grad_norm": 1.9274513721466064, "learning_rate": 1.6272953049822376e-05, "loss": 1.9457, "step": 2480 }, { "epoch": 16.048338368580062, "grad_norm": 1.4026292562484741, "learning_rate": 1.6239663025858356e-05, "loss": 2.1124, "step": 2490 }, { "epoch": 16.112789526686807, "grad_norm": 1.543957233428955, "learning_rate": 1.6206259378280956e-05, "loss": 1.9235, "step": 2500 }, { "epoch": 16.112789526686807, "eval_loss": 1.9452571868896484, "eval_runtime": 20.1977, "eval_samples_per_second": 79.465, "eval_steps_per_second": 5.001, "step": 2500 }, { "epoch": 16.177240684793556, "grad_norm": 1.6722089052200317, "learning_rate": 1.6172742715367124e-05, "loss": 1.8982, "step": 2510 }, { "epoch": 16.241691842900302, "grad_norm": 1.649903416633606, "learning_rate": 1.613911364745179e-05, "loss": 1.9176, "step": 2520 }, { "epoch": 16.30614300100705, "grad_norm": 1.3199961185455322, "learning_rate": 1.6105372786916776e-05, "loss": 1.9226, "step": 2530 }, { "epoch": 16.370594159113796, "grad_norm": 1.513606309890747, "learning_rate": 1.607152074817964e-05, "loss": 1.9141, "step": 2540 }, { "epoch": 16.435045317220546, "grad_norm": 1.3551160097122192, "learning_rate": 1.6037558147682473e-05, "loss": 1.9051, "step": 2550 }, { "epoch": 16.49949647532729, "grad_norm": 1.5009561777114868, "learning_rate": 1.60034856038807e-05, "loss": 1.9184, "step": 2560 }, { "epoch": 16.56394763343404, "grad_norm": 1.4896663427352905, "learning_rate": 1.5969303737231786e-05, "loss": 1.9005, "step": 2570 }, { "epoch": 16.628398791540786, "grad_norm": 1.4002718925476074, "learning_rate": 1.593501317018396e-05, "loss": 1.9057, "step": 2580 }, { "epoch": 16.69284994964753, "grad_norm": 1.5425680875778198, "learning_rate": 1.5900614527164876e-05, "loss": 1.8948, "step": 2590 }, { "epoch": 16.75730110775428, "grad_norm": 1.4177629947662354, "learning_rate": 1.586610843457024e-05, "loss": 1.8748, "step": 2600 }, { "epoch": 16.75730110775428, "eval_loss": 1.913808822631836, "eval_runtime": 19.5797, "eval_samples_per_second": 81.973, "eval_steps_per_second": 5.158, "step": 2600 }, { "epoch": 16.821752265861026, "grad_norm": 1.6956419944763184, "learning_rate": 1.5831495520752395e-05, "loss": 1.8708, "step": 2610 }, { "epoch": 16.886203423967775, "grad_norm": 1.5349304676055908, "learning_rate": 1.5796776416008897e-05, "loss": 1.8814, "step": 2620 }, { "epoch": 16.95065458207452, "grad_norm": 1.4432286024093628, "learning_rate": 1.5761951752571032e-05, "loss": 1.8622, "step": 2630 }, { "epoch": 17.01510574018127, "grad_norm": 1.4395033121109009, "learning_rate": 1.5727022164592282e-05, "loss": 2.0162, "step": 2640 }, { "epoch": 17.079556898288015, "grad_norm": 1.451162338256836, "learning_rate": 1.569198828813681e-05, "loss": 1.8803, "step": 2650 }, { "epoch": 17.144008056394764, "grad_norm": 1.6109211444854736, "learning_rate": 1.5656850761167848e-05, "loss": 1.8511, "step": 2660 }, { "epoch": 17.20845921450151, "grad_norm": 1.504939079284668, "learning_rate": 1.562161022353611e-05, "loss": 1.8727, "step": 2670 }, { "epoch": 17.27291037260826, "grad_norm": 1.2910106182098389, "learning_rate": 1.55862673169681e-05, "loss": 1.859, "step": 2680 }, { "epoch": 17.337361530715004, "grad_norm": 1.4325050115585327, "learning_rate": 1.5550822685054475e-05, "loss": 1.874, "step": 2690 }, { "epoch": 17.401812688821753, "grad_norm": 1.5667049884796143, "learning_rate": 1.5515276973238286e-05, "loss": 1.8678, "step": 2700 }, { "epoch": 17.401812688821753, "eval_loss": 1.879951000213623, "eval_runtime": 21.041, "eval_samples_per_second": 76.28, "eval_steps_per_second": 4.8, "step": 2700 }, { "epoch": 17.4662638469285, "grad_norm": 1.5697641372680664, "learning_rate": 1.5479630828803235e-05, "loss": 1.8341, "step": 2710 }, { "epoch": 17.530715005035248, "grad_norm": 1.5967832803726196, "learning_rate": 1.5443884900861904e-05, "loss": 1.8426, "step": 2720 }, { "epoch": 17.595166163141993, "grad_norm": 1.2664175033569336, "learning_rate": 1.5408039840343903e-05, "loss": 1.8516, "step": 2730 }, { "epoch": 17.659617321248742, "grad_norm": 1.5191247463226318, "learning_rate": 1.5372096299984064e-05, "loss": 1.8435, "step": 2740 }, { "epoch": 17.724068479355488, "grad_norm": 1.5485187768936157, "learning_rate": 1.5336054934310502e-05, "loss": 1.8551, "step": 2750 }, { "epoch": 17.788519637462237, "grad_norm": 1.3325048685073853, "learning_rate": 1.5299916399632726e-05, "loss": 1.8329, "step": 2760 }, { "epoch": 17.852970795568982, "grad_norm": 1.4892091751098633, "learning_rate": 1.5263681354029694e-05, "loss": 1.8275, "step": 2770 }, { "epoch": 17.91742195367573, "grad_norm": 1.3611677885055542, "learning_rate": 1.5227350457337809e-05, "loss": 1.8343, "step": 2780 }, { "epoch": 17.981873111782477, "grad_norm": 1.3932119607925415, "learning_rate": 1.5190924371138908e-05, "loss": 1.8319, "step": 2790 }, { "epoch": 18.046324269889226, "grad_norm": 1.4689136743545532, "learning_rate": 1.5154403758748228e-05, "loss": 1.9983, "step": 2800 }, { "epoch": 18.046324269889226, "eval_loss": 1.8525444269180298, "eval_runtime": 19.7876, "eval_samples_per_second": 81.111, "eval_steps_per_second": 5.104, "step": 2800 }, { "epoch": 18.11077542799597, "grad_norm": 1.7211060523986816, "learning_rate": 1.5117789285202313e-05, "loss": 1.8104, "step": 2810 }, { "epoch": 18.17522658610272, "grad_norm": 1.6318062543869019, "learning_rate": 1.5081081617246912e-05, "loss": 1.821, "step": 2820 }, { "epoch": 18.239677744209466, "grad_norm": 1.2402304410934448, "learning_rate": 1.5044281423324826e-05, "loss": 1.8104, "step": 2830 }, { "epoch": 18.304128902316215, "grad_norm": 1.438472032546997, "learning_rate": 1.500738937356376e-05, "loss": 1.8259, "step": 2840 }, { "epoch": 18.36858006042296, "grad_norm": 1.4083638191223145, "learning_rate": 1.4970406139764092e-05, "loss": 1.808, "step": 2850 }, { "epoch": 18.43303121852971, "grad_norm": 1.2466082572937012, "learning_rate": 1.4933332395386652e-05, "loss": 1.8232, "step": 2860 }, { "epoch": 18.497482376636455, "grad_norm": 1.2091064453125, "learning_rate": 1.4896168815540464e-05, "loss": 1.812, "step": 2870 }, { "epoch": 18.561933534743204, "grad_norm": 1.6740643978118896, "learning_rate": 1.4858916076970444e-05, "loss": 1.7973, "step": 2880 }, { "epoch": 18.62638469284995, "grad_norm": 1.2571892738342285, "learning_rate": 1.4821574858045073e-05, "loss": 1.8096, "step": 2890 }, { "epoch": 18.690835850956695, "grad_norm": 1.3580372333526611, "learning_rate": 1.4784145838744067e-05, "loss": 1.8117, "step": 2900 }, { "epoch": 18.690835850956695, "eval_loss": 1.8296641111373901, "eval_runtime": 22.6507, "eval_samples_per_second": 70.859, "eval_steps_per_second": 4.459, "step": 2900 }, { "epoch": 18.755287009063444, "grad_norm": 1.4074209928512573, "learning_rate": 1.4746629700645955e-05, "loss": 1.8154, "step": 2910 }, { "epoch": 18.81973816717019, "grad_norm": 1.4824714660644531, "learning_rate": 1.470902712691571e-05, "loss": 1.8, "step": 2920 }, { "epoch": 18.88418932527694, "grad_norm": 1.4426990747451782, "learning_rate": 1.4671338802292274e-05, "loss": 1.7956, "step": 2930 }, { "epoch": 18.948640483383684, "grad_norm": 1.4474570751190186, "learning_rate": 1.4633565413076114e-05, "loss": 1.7948, "step": 2940 }, { "epoch": 19.013091641490433, "grad_norm": 1.2493271827697754, "learning_rate": 1.4595707647116713e-05, "loss": 1.9644, "step": 2950 }, { "epoch": 19.07754279959718, "grad_norm": 1.4267578125, "learning_rate": 1.4557766193800036e-05, "loss": 1.781, "step": 2960 }, { "epoch": 19.141993957703928, "grad_norm": 1.355020523071289, "learning_rate": 1.4519741744036e-05, "loss": 1.7878, "step": 2970 }, { "epoch": 19.206445115810673, "grad_norm": 1.3975870609283447, "learning_rate": 1.4481634990245871e-05, "loss": 1.7899, "step": 2980 }, { "epoch": 19.270896273917423, "grad_norm": 1.426984190940857, "learning_rate": 1.4443446626349662e-05, "loss": 1.775, "step": 2990 }, { "epoch": 19.335347432024168, "grad_norm": 1.5936192274093628, "learning_rate": 1.4405177347753503e-05, "loss": 1.7697, "step": 3000 }, { "epoch": 19.335347432024168, "eval_loss": 1.8065377473831177, "eval_runtime": 20.2136, "eval_samples_per_second": 79.402, "eval_steps_per_second": 4.997, "step": 3000 }, { "epoch": 19.399798590130917, "grad_norm": 1.3064271211624146, "learning_rate": 1.4366827851336964e-05, "loss": 1.7844, "step": 3010 }, { "epoch": 19.464249748237663, "grad_norm": 1.2223880290985107, "learning_rate": 1.4328398835440381e-05, "loss": 1.7739, "step": 3020 }, { "epoch": 19.52870090634441, "grad_norm": 1.5116280317306519, "learning_rate": 1.4289890999852126e-05, "loss": 1.7707, "step": 3030 }, { "epoch": 19.593152064451157, "grad_norm": 1.3726651668548584, "learning_rate": 1.4251305045795874e-05, "loss": 1.7789, "step": 3040 }, { "epoch": 19.657603222557906, "grad_norm": 1.4748300313949585, "learning_rate": 1.4212641675917823e-05, "loss": 1.7715, "step": 3050 }, { "epoch": 19.72205438066465, "grad_norm": 1.4205374717712402, "learning_rate": 1.4173901594273917e-05, "loss": 1.7823, "step": 3060 }, { "epoch": 19.7865055387714, "grad_norm": 1.2562229633331299, "learning_rate": 1.4135085506316997e-05, "loss": 1.7643, "step": 3070 }, { "epoch": 19.850956696878146, "grad_norm": 1.3158072233200073, "learning_rate": 1.4096194118883982e-05, "loss": 1.7599, "step": 3080 }, { "epoch": 19.915407854984895, "grad_norm": 1.3303531408309937, "learning_rate": 1.4057228140182982e-05, "loss": 1.7701, "step": 3090 }, { "epoch": 19.97985901309164, "grad_norm": 1.4447650909423828, "learning_rate": 1.4018188279780412e-05, "loss": 1.7587, "step": 3100 }, { "epoch": 19.97985901309164, "eval_loss": 1.7869038581848145, "eval_runtime": 19.8118, "eval_samples_per_second": 81.012, "eval_steps_per_second": 5.098, "step": 3100 }, { "epoch": 20.04431017119839, "grad_norm": 1.4862005710601807, "learning_rate": 1.3979075248588054e-05, "loss": 1.9169, "step": 3110 }, { "epoch": 20.108761329305135, "grad_norm": 1.2900787591934204, "learning_rate": 1.3939889758850138e-05, "loss": 1.7603, "step": 3120 }, { "epoch": 20.173212487411885, "grad_norm": 1.1818557977676392, "learning_rate": 1.3900632524130343e-05, "loss": 1.7501, "step": 3130 }, { "epoch": 20.23766364551863, "grad_norm": 1.4612010717391968, "learning_rate": 1.3861304259298823e-05, "loss": 1.7412, "step": 3140 }, { "epoch": 20.30211480362538, "grad_norm": 1.2869311571121216, "learning_rate": 1.3821905680519181e-05, "loss": 1.7442, "step": 3150 }, { "epoch": 20.366565961732125, "grad_norm": 1.4396709203720093, "learning_rate": 1.378243750523543e-05, "loss": 1.7634, "step": 3160 }, { "epoch": 20.431017119838874, "grad_norm": 1.2522846460342407, "learning_rate": 1.3742900452158932e-05, "loss": 1.7422, "step": 3170 }, { "epoch": 20.49546827794562, "grad_norm": 1.4543647766113281, "learning_rate": 1.3703295241255296e-05, "loss": 1.75, "step": 3180 }, { "epoch": 20.55991943605237, "grad_norm": 1.191945195198059, "learning_rate": 1.3663622593731294e-05, "loss": 1.7302, "step": 3190 }, { "epoch": 20.624370594159114, "grad_norm": 1.4461244344711304, "learning_rate": 1.3623883232021693e-05, "loss": 1.7381, "step": 3200 }, { "epoch": 20.624370594159114, "eval_loss": 1.77000892162323, "eval_runtime": 19.6254, "eval_samples_per_second": 81.782, "eval_steps_per_second": 5.146, "step": 3200 }, { "epoch": 20.68882175226586, "grad_norm": 1.2070411443710327, "learning_rate": 1.3584077879776132e-05, "loss": 1.739, "step": 3210 }, { "epoch": 20.75327291037261, "grad_norm": 1.3720213174819946, "learning_rate": 1.3544207261845928e-05, "loss": 1.7366, "step": 3220 }, { "epoch": 20.817724068479354, "grad_norm": 1.3589030504226685, "learning_rate": 1.3504272104270876e-05, "loss": 1.7427, "step": 3230 }, { "epoch": 20.882175226586103, "grad_norm": 1.1105351448059082, "learning_rate": 1.3464273134266037e-05, "loss": 1.7487, "step": 3240 }, { "epoch": 20.94662638469285, "grad_norm": 1.2116628885269165, "learning_rate": 1.3424211080208478e-05, "loss": 1.7388, "step": 3250 }, { "epoch": 21.011077542799597, "grad_norm": 1.281503438949585, "learning_rate": 1.338408667162404e-05, "loss": 1.8821, "step": 3260 }, { "epoch": 21.075528700906343, "grad_norm": 1.3089734315872192, "learning_rate": 1.3343900639174007e-05, "loss": 1.7235, "step": 3270 }, { "epoch": 21.139979859013092, "grad_norm": 1.1901700496673584, "learning_rate": 1.3303653714641853e-05, "loss": 1.7281, "step": 3280 }, { "epoch": 21.204431017119838, "grad_norm": 1.307697057723999, "learning_rate": 1.3263346630919875e-05, "loss": 1.7273, "step": 3290 }, { "epoch": 21.268882175226587, "grad_norm": 1.1000251770019531, "learning_rate": 1.3222980121995867e-05, "loss": 1.7264, "step": 3300 }, { "epoch": 21.268882175226587, "eval_loss": 1.7537308931350708, "eval_runtime": 20.2292, "eval_samples_per_second": 79.341, "eval_steps_per_second": 4.993, "step": 3300 }, { "epoch": 21.333333333333332, "grad_norm": 1.1835920810699463, "learning_rate": 1.3182554922939748e-05, "loss": 1.7096, "step": 3310 }, { "epoch": 21.39778449144008, "grad_norm": 1.3662201166152954, "learning_rate": 1.3142071769890182e-05, "loss": 1.716, "step": 3320 }, { "epoch": 21.462235649546827, "grad_norm": 1.1962809562683105, "learning_rate": 1.3101531400041163e-05, "loss": 1.7298, "step": 3330 }, { "epoch": 21.526686807653576, "grad_norm": 1.1254611015319824, "learning_rate": 1.3060934551628603e-05, "loss": 1.7316, "step": 3340 }, { "epoch": 21.59113796576032, "grad_norm": 1.4266749620437622, "learning_rate": 1.3020281963916883e-05, "loss": 1.7149, "step": 3350 }, { "epoch": 21.65558912386707, "grad_norm": 1.2904185056686401, "learning_rate": 1.2979574377185385e-05, "loss": 1.7324, "step": 3360 }, { "epoch": 21.720040281973816, "grad_norm": 1.357013463973999, "learning_rate": 1.293881253271502e-05, "loss": 1.7155, "step": 3370 }, { "epoch": 21.784491440080565, "grad_norm": 1.285480260848999, "learning_rate": 1.289799717277473e-05, "loss": 1.724, "step": 3380 }, { "epoch": 21.84894259818731, "grad_norm": 1.2539184093475342, "learning_rate": 1.2857129040607963e-05, "loss": 1.7297, "step": 3390 }, { "epoch": 21.91339375629406, "grad_norm": 1.2335858345031738, "learning_rate": 1.281620888041915e-05, "loss": 1.7112, "step": 3400 }, { "epoch": 21.91339375629406, "eval_loss": 1.738411784172058, "eval_runtime": 19.8069, "eval_samples_per_second": 81.032, "eval_steps_per_second": 5.099, "step": 3400 }, { "epoch": 21.977844914400805, "grad_norm": 1.2766244411468506, "learning_rate": 1.2775237437360137e-05, "loss": 1.6879, "step": 3410 }, { "epoch": 22.042296072507554, "grad_norm": 1.1971969604492188, "learning_rate": 1.2734215457516639e-05, "loss": 1.8451, "step": 3420 }, { "epoch": 22.1067472306143, "grad_norm": 1.2040929794311523, "learning_rate": 1.269314368789463e-05, "loss": 1.7007, "step": 3430 }, { "epoch": 22.17119838872105, "grad_norm": 1.2112030982971191, "learning_rate": 1.2652022876406756e-05, "loss": 1.7094, "step": 3440 }, { "epoch": 22.235649546827794, "grad_norm": 1.2944767475128174, "learning_rate": 1.2610853771858702e-05, "loss": 1.69, "step": 3450 }, { "epoch": 22.300100704934543, "grad_norm": 1.3381768465042114, "learning_rate": 1.2569637123935581e-05, "loss": 1.7046, "step": 3460 }, { "epoch": 22.36455186304129, "grad_norm": 1.205153465270996, "learning_rate": 1.2528373683188247e-05, "loss": 1.7066, "step": 3470 }, { "epoch": 22.429003021148038, "grad_norm": 1.2156826257705688, "learning_rate": 1.248706420101966e-05, "loss": 1.7052, "step": 3480 }, { "epoch": 22.493454179254783, "grad_norm": 1.2174049615859985, "learning_rate": 1.2445709429671184e-05, "loss": 1.688, "step": 3490 }, { "epoch": 22.557905337361532, "grad_norm": 1.1301804780960083, "learning_rate": 1.2404310122208895e-05, "loss": 1.7036, "step": 3500 }, { "epoch": 22.557905337361532, "eval_loss": 1.7260056734085083, "eval_runtime": 20.2022, "eval_samples_per_second": 79.447, "eval_steps_per_second": 4.999, "step": 3500 }, { "epoch": 22.622356495468278, "grad_norm": 1.2046083211898804, "learning_rate": 1.2362867032509871e-05, "loss": 1.7096, "step": 3510 }, { "epoch": 22.686807653575023, "grad_norm": 1.3079477548599243, "learning_rate": 1.2321380915248446e-05, "loss": 1.7013, "step": 3520 }, { "epoch": 22.751258811681772, "grad_norm": 1.0908950567245483, "learning_rate": 1.2279852525882504e-05, "loss": 1.6883, "step": 3530 }, { "epoch": 22.815709969788518, "grad_norm": 1.3112512826919556, "learning_rate": 1.2238282620639677e-05, "loss": 1.7084, "step": 3540 }, { "epoch": 22.880161127895267, "grad_norm": 1.1353329420089722, "learning_rate": 1.2196671956503611e-05, "loss": 1.6871, "step": 3550 }, { "epoch": 22.944612286002013, "grad_norm": 1.1687755584716797, "learning_rate": 1.2155021291200161e-05, "loss": 1.6921, "step": 3560 }, { "epoch": 23.00906344410876, "grad_norm": 1.0935124158859253, "learning_rate": 1.2113331383183607e-05, "loss": 1.8299, "step": 3570 }, { "epoch": 23.073514602215507, "grad_norm": 1.1443811655044556, "learning_rate": 1.2071602991622822e-05, "loss": 1.695, "step": 3580 }, { "epoch": 23.137965760322256, "grad_norm": 1.0561773777008057, "learning_rate": 1.202983687638747e-05, "loss": 1.6879, "step": 3590 }, { "epoch": 23.202416918429, "grad_norm": 1.1481385231018066, "learning_rate": 1.198803379803416e-05, "loss": 1.6766, "step": 3600 }, { "epoch": 23.202416918429, "eval_loss": 1.7131911516189575, "eval_runtime": 20.2073, "eval_samples_per_second": 79.427, "eval_steps_per_second": 4.998, "step": 3600 }, { "epoch": 23.26686807653575, "grad_norm": 1.1637824773788452, "learning_rate": 1.1946194517792584e-05, "loss": 1.6887, "step": 3610 }, { "epoch": 23.331319234642496, "grad_norm": 1.168934941291809, "learning_rate": 1.190431979755168e-05, "loss": 1.6692, "step": 3620 }, { "epoch": 23.395770392749245, "grad_norm": 1.1618574857711792, "learning_rate": 1.1862410399845739e-05, "loss": 1.6696, "step": 3630 }, { "epoch": 23.46022155085599, "grad_norm": 1.118910312652588, "learning_rate": 1.1820467087840526e-05, "loss": 1.6804, "step": 3640 }, { "epoch": 23.52467270896274, "grad_norm": 1.1120346784591675, "learning_rate": 1.1778490625319376e-05, "loss": 1.6863, "step": 3650 }, { "epoch": 23.589123867069485, "grad_norm": 1.1935482025146484, "learning_rate": 1.1736481776669307e-05, "loss": 1.6881, "step": 3660 }, { "epoch": 23.653575025176234, "grad_norm": 1.1648095846176147, "learning_rate": 1.1694441306867062e-05, "loss": 1.6813, "step": 3670 }, { "epoch": 23.71802618328298, "grad_norm": 1.0669279098510742, "learning_rate": 1.1652369981465218e-05, "loss": 1.6737, "step": 3680 }, { "epoch": 23.78247734138973, "grad_norm": 1.2142657041549683, "learning_rate": 1.1610268566578233e-05, "loss": 1.6825, "step": 3690 }, { "epoch": 23.846928499496475, "grad_norm": 1.05051851272583, "learning_rate": 1.1568137828868478e-05, "loss": 1.6606, "step": 3700 }, { "epoch": 23.846928499496475, "eval_loss": 1.7030967473983765, "eval_runtime": 20.2178, "eval_samples_per_second": 79.385, "eval_steps_per_second": 4.996, "step": 3700 }, { "epoch": 23.911379657603224, "grad_norm": 1.1508402824401855, "learning_rate": 1.15259785355323e-05, "loss": 1.6664, "step": 3710 }, { "epoch": 23.97583081570997, "grad_norm": 1.260407567024231, "learning_rate": 1.1483791454286027e-05, "loss": 1.6875, "step": 3720 }, { "epoch": 24.040281973816718, "grad_norm": 1.032926082611084, "learning_rate": 1.1441577353352023e-05, "loss": 1.7966, "step": 3730 }, { "epoch": 24.104733131923464, "grad_norm": 1.0912493467330933, "learning_rate": 1.1399337001444658e-05, "loss": 1.6737, "step": 3740 }, { "epoch": 24.169184290030213, "grad_norm": 1.026868462562561, "learning_rate": 1.1357071167756341e-05, "loss": 1.657, "step": 3750 }, { "epoch": 24.23363544813696, "grad_norm": 1.212310552597046, "learning_rate": 1.13147806219435e-05, "loss": 1.673, "step": 3760 }, { "epoch": 24.298086606243707, "grad_norm": 1.0902752876281738, "learning_rate": 1.1272466134112562e-05, "loss": 1.6793, "step": 3770 }, { "epoch": 24.362537764350453, "grad_norm": 1.1671861410140991, "learning_rate": 1.1230128474805948e-05, "loss": 1.664, "step": 3780 }, { "epoch": 24.426988922457202, "grad_norm": 1.0966593027114868, "learning_rate": 1.1187768414988015e-05, "loss": 1.6649, "step": 3790 }, { "epoch": 24.491440080563947, "grad_norm": 1.0915180444717407, "learning_rate": 1.114538672603104e-05, "loss": 1.6749, "step": 3800 }, { "epoch": 24.491440080563947, "eval_loss": 1.6930159330368042, "eval_runtime": 19.486, "eval_samples_per_second": 82.367, "eval_steps_per_second": 5.183, "step": 3800 }, { "epoch": 24.555891238670696, "grad_norm": 1.0787924528121948, "learning_rate": 1.1102984179701157e-05, "loss": 1.659, "step": 3810 }, { "epoch": 24.620342396777442, "grad_norm": 1.0481791496276855, "learning_rate": 1.1060561548144321e-05, "loss": 1.6558, "step": 3820 }, { "epoch": 24.68479355488419, "grad_norm": 1.06913161277771, "learning_rate": 1.1018119603872228e-05, "loss": 1.6551, "step": 3830 }, { "epoch": 24.749244712990937, "grad_norm": 0.9699168801307678, "learning_rate": 1.0975659119748265e-05, "loss": 1.6579, "step": 3840 }, { "epoch": 24.813695871097686, "grad_norm": 1.0122530460357666, "learning_rate": 1.0933180868973414e-05, "loss": 1.6517, "step": 3850 }, { "epoch": 24.87814702920443, "grad_norm": 0.935750424861908, "learning_rate": 1.08906856250722e-05, "loss": 1.6567, "step": 3860 }, { "epoch": 24.942598187311177, "grad_norm": 0.929023027420044, "learning_rate": 1.0848174161878584e-05, "loss": 1.6501, "step": 3870 }, { "epoch": 25.007049345417926, "grad_norm": 1.1005076169967651, "learning_rate": 1.080564725352188e-05, "loss": 1.8029, "step": 3880 }, { "epoch": 25.07150050352467, "grad_norm": 0.9921239018440247, "learning_rate": 1.076310567441266e-05, "loss": 1.6426, "step": 3890 }, { "epoch": 25.13595166163142, "grad_norm": 1.0867966413497925, "learning_rate": 1.072055019922864e-05, "loss": 1.6545, "step": 3900 }, { "epoch": 25.13595166163142, "eval_loss": 1.6841835975646973, "eval_runtime": 19.9451, "eval_samples_per_second": 80.471, "eval_steps_per_second": 5.064, "step": 3900 }, { "epoch": 25.200402819738166, "grad_norm": 0.9666146636009216, "learning_rate": 1.067798160290059e-05, "loss": 1.6428, "step": 3910 }, { "epoch": 25.264853977844915, "grad_norm": 1.1212939023971558, "learning_rate": 1.0635400660598214e-05, "loss": 1.6361, "step": 3920 }, { "epoch": 25.32930513595166, "grad_norm": 1.075994610786438, "learning_rate": 1.0592808147716032e-05, "loss": 1.6567, "step": 3930 }, { "epoch": 25.39375629405841, "grad_norm": 0.9849887490272522, "learning_rate": 1.0550204839859265e-05, "loss": 1.6587, "step": 3940 }, { "epoch": 25.458207452165155, "grad_norm": 0.932995617389679, "learning_rate": 1.0507591512829707e-05, "loss": 1.6471, "step": 3950 }, { "epoch": 25.522658610271904, "grad_norm": 1.0005803108215332, "learning_rate": 1.0464968942611608e-05, "loss": 1.6508, "step": 3960 }, { "epoch": 25.58710976837865, "grad_norm": 1.2026358842849731, "learning_rate": 1.0422337905357523e-05, "loss": 1.6506, "step": 3970 }, { "epoch": 25.6515609264854, "grad_norm": 1.075043797492981, "learning_rate": 1.0379699177374199e-05, "loss": 1.6372, "step": 3980 }, { "epoch": 25.716012084592144, "grad_norm": 0.983676016330719, "learning_rate": 1.0337053535108427e-05, "loss": 1.6494, "step": 3990 }, { "epoch": 25.780463242698893, "grad_norm": 0.9707674384117126, "learning_rate": 1.0294401755132912e-05, "loss": 1.6509, "step": 4000 }, { "epoch": 25.780463242698893, "eval_loss": 1.6760313510894775, "eval_runtime": 19.7916, "eval_samples_per_second": 81.095, "eval_steps_per_second": 5.103, "step": 4000 }, { "epoch": 25.84491440080564, "grad_norm": 1.0340772867202759, "learning_rate": 1.0251744614132117e-05, "loss": 1.6614, "step": 4010 }, { "epoch": 25.909365558912388, "grad_norm": 0.9816955924034119, "learning_rate": 1.0209082888888143e-05, "loss": 1.6327, "step": 4020 }, { "epoch": 25.973816717019133, "grad_norm": 0.9904563426971436, "learning_rate": 1.0166417356266546e-05, "loss": 1.6408, "step": 4030 }, { "epoch": 26.038267875125882, "grad_norm": 0.9868249893188477, "learning_rate": 1.0123748793202242e-05, "loss": 1.7511, "step": 4040 }, { "epoch": 26.102719033232628, "grad_norm": 0.9166064262390137, "learning_rate": 1.0081077976685307e-05, "loss": 1.6361, "step": 4050 }, { "epoch": 26.167170191339377, "grad_norm": 0.9603624939918518, "learning_rate": 1.0038405683746868e-05, "loss": 1.6374, "step": 4060 }, { "epoch": 26.231621349446122, "grad_norm": 0.9980469346046448, "learning_rate": 9.995732691444932e-06, "loss": 1.6464, "step": 4070 }, { "epoch": 26.29607250755287, "grad_norm": 1.0243116617202759, "learning_rate": 9.953059776850238e-06, "loss": 1.6364, "step": 4080 }, { "epoch": 26.360523665659617, "grad_norm": 0.9727787971496582, "learning_rate": 9.910387717032115e-06, "loss": 1.6366, "step": 4090 }, { "epoch": 26.424974823766366, "grad_norm": 0.9883456826210022, "learning_rate": 9.86771728904433e-06, "loss": 1.6423, "step": 4100 }, { "epoch": 26.424974823766366, "eval_loss": 1.6685707569122314, "eval_runtime": 21.0152, "eval_samples_per_second": 76.373, "eval_steps_per_second": 4.806, "step": 4100 }, { "epoch": 26.48942598187311, "grad_norm": 0.9553434252738953, "learning_rate": 9.82504926991092e-06, "loss": 1.6439, "step": 4110 }, { "epoch": 26.55387713997986, "grad_norm": 1.038282036781311, "learning_rate": 9.782384436612072e-06, "loss": 1.641, "step": 4120 }, { "epoch": 26.618328298086606, "grad_norm": 1.0809876918792725, "learning_rate": 9.73972356606995e-06, "loss": 1.6335, "step": 4130 }, { "epoch": 26.682779456193355, "grad_norm": 1.0616182088851929, "learning_rate": 9.697067435134564e-06, "loss": 1.6168, "step": 4140 }, { "epoch": 26.7472306143001, "grad_norm": 0.8756535649299622, "learning_rate": 9.654416820569618e-06, "loss": 1.6203, "step": 4150 }, { "epoch": 26.81168177240685, "grad_norm": 1.1267576217651367, "learning_rate": 9.611772499038345e-06, "loss": 1.6279, "step": 4160 }, { "epoch": 26.876132930513595, "grad_norm": 0.9867852926254272, "learning_rate": 9.569135247089401e-06, "loss": 1.6323, "step": 4170 }, { "epoch": 26.94058408862034, "grad_norm": 0.9736083149909973, "learning_rate": 9.526505841142702e-06, "loss": 1.6328, "step": 4180 }, { "epoch": 27.00503524672709, "grad_norm": 1.8718211650848389, "learning_rate": 9.48388505747529e-06, "loss": 1.7769, "step": 4190 }, { "epoch": 27.069486404833835, "grad_norm": 0.9595508575439453, "learning_rate": 9.441273672207187e-06, "loss": 1.6209, "step": 4200 }, { "epoch": 27.069486404833835, "eval_loss": 1.6613610982894897, "eval_runtime": 20.2063, "eval_samples_per_second": 79.431, "eval_steps_per_second": 4.998, "step": 4200 } ], "logging_steps": 10, "max_steps": 7750, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3849836110034764e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }