{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.8201261132418904, "eval_steps": 500, "global_step": 3500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00520054605733602, "grad_norm": 16.05304718017578, "learning_rate": 1.5570934256055363e-06, "loss": 1.3826, "step": 10 }, { "epoch": 0.01040109211467204, "grad_norm": 13.638071060180664, "learning_rate": 3.2871972318339097e-06, "loss": 1.2516, "step": 20 }, { "epoch": 0.015601638172008062, "grad_norm": 17.292724609375, "learning_rate": 5.017301038062284e-06, "loss": 1.1593, "step": 30 }, { "epoch": 0.02080218422934408, "grad_norm": 2.1085638999938965, "learning_rate": 6.747404844290659e-06, "loss": 0.5496, "step": 40 }, { "epoch": 0.0260027302866801, "grad_norm": 1.403855562210083, "learning_rate": 8.477508650519032e-06, "loss": 0.369, "step": 50 }, { "epoch": 0.031203276344016123, "grad_norm": 2.6436541080474854, "learning_rate": 1.0207612456747406e-05, "loss": 0.2841, "step": 60 }, { "epoch": 0.03640382240135214, "grad_norm": 1.9844225645065308, "learning_rate": 1.193771626297578e-05, "loss": 0.4041, "step": 70 }, { "epoch": 0.04160436845868816, "grad_norm": 2.442768096923828, "learning_rate": 1.3667820069204155e-05, "loss": 0.2672, "step": 80 }, { "epoch": 0.046804914516024185, "grad_norm": 2.2531392574310303, "learning_rate": 1.5397923875432525e-05, "loss": 0.3032, "step": 90 }, { "epoch": 0.0520054605733602, "grad_norm": 1.9605236053466797, "learning_rate": 1.7128027681660898e-05, "loss": 0.2662, "step": 100 }, { "epoch": 0.05720600663069622, "grad_norm": 2.8307950496673584, "learning_rate": 1.8858131487889273e-05, "loss": 0.2989, "step": 110 }, { "epoch": 0.062406552688032246, "grad_norm": 2.2501089572906494, "learning_rate": 2.058823529411765e-05, "loss": 0.2533, "step": 120 }, { "epoch": 0.06760709874536826, "grad_norm": 3.275035858154297, "learning_rate": 2.231833910034602e-05, "loss": 0.2911, "step": 130 }, { "epoch": 0.07280764480270428, "grad_norm": 3.080817699432373, "learning_rate": 2.4048442906574396e-05, "loss": 0.3063, "step": 140 }, { "epoch": 0.07800819086004031, "grad_norm": 5.425448894500732, "learning_rate": 2.5778546712802772e-05, "loss": 0.3093, "step": 150 }, { "epoch": 0.08320873691737632, "grad_norm": 1.7119687795639038, "learning_rate": 2.7508650519031144e-05, "loss": 0.2612, "step": 160 }, { "epoch": 0.08840928297471234, "grad_norm": 4.50128173828125, "learning_rate": 2.9238754325259516e-05, "loss": 0.2895, "step": 170 }, { "epoch": 0.09360982903204837, "grad_norm": 1.802933692932129, "learning_rate": 3.096885813148789e-05, "loss": 0.2085, "step": 180 }, { "epoch": 0.09881037508938438, "grad_norm": 2.629002571105957, "learning_rate": 3.269896193771627e-05, "loss": 0.2971, "step": 190 }, { "epoch": 0.1040109211467204, "grad_norm": 1.411960244178772, "learning_rate": 3.4429065743944636e-05, "loss": 0.2985, "step": 200 }, { "epoch": 0.10921146720405643, "grad_norm": 1.9607653617858887, "learning_rate": 3.615916955017301e-05, "loss": 0.2282, "step": 210 }, { "epoch": 0.11441201326139244, "grad_norm": 3.4696173667907715, "learning_rate": 3.788927335640138e-05, "loss": 0.259, "step": 220 }, { "epoch": 0.11961255931872847, "grad_norm": 3.1414554119110107, "learning_rate": 3.961937716262976e-05, "loss": 0.2353, "step": 230 }, { "epoch": 0.12481310537606449, "grad_norm": 1.7068389654159546, "learning_rate": 4.134948096885813e-05, "loss": 0.2279, "step": 240 }, { "epoch": 0.13001365143340052, "grad_norm": 2.7408318519592285, "learning_rate": 4.307958477508651e-05, "loss": 0.2809, "step": 250 }, { "epoch": 0.13521419749073652, "grad_norm": 3.036931276321411, "learning_rate": 4.480968858131488e-05, "loss": 0.253, "step": 260 }, { "epoch": 0.14041474354807254, "grad_norm": 2.8552465438842773, "learning_rate": 4.653979238754326e-05, "loss": 0.2454, "step": 270 }, { "epoch": 0.14561528960540857, "grad_norm": 3.6416499614715576, "learning_rate": 4.826989619377163e-05, "loss": 0.244, "step": 280 }, { "epoch": 0.1508158356627446, "grad_norm": 3.5004782676696777, "learning_rate": 5e-05, "loss": 0.2909, "step": 290 }, { "epoch": 0.15601638172008062, "grad_norm": 1.2734322547912598, "learning_rate": 4.999958918390321e-05, "loss": 0.33, "step": 300 }, { "epoch": 0.16121692777741664, "grad_norm": 5.017611026763916, "learning_rate": 4.999835674911443e-05, "loss": 0.2723, "step": 310 }, { "epoch": 0.16641747383475264, "grad_norm": 2.2255094051361084, "learning_rate": 4.999630273613799e-05, "loss": 0.2993, "step": 320 }, { "epoch": 0.17161801989208866, "grad_norm": 1.4226183891296387, "learning_rate": 4.9993427212479606e-05, "loss": 0.2749, "step": 330 }, { "epoch": 0.1768185659494247, "grad_norm": 1.295336127281189, "learning_rate": 4.998973027264419e-05, "loss": 0.2618, "step": 340 }, { "epoch": 0.1820191120067607, "grad_norm": 1.9380894899368286, "learning_rate": 4.998521203813274e-05, "loss": 0.2595, "step": 350 }, { "epoch": 0.18721965806409674, "grad_norm": 2.218477964401245, "learning_rate": 4.997987265743834e-05, "loss": 0.2305, "step": 360 }, { "epoch": 0.19242020412143276, "grad_norm": 2.5676722526550293, "learning_rate": 4.9973712306041256e-05, "loss": 0.2259, "step": 370 }, { "epoch": 0.19762075017876876, "grad_norm": 1.3287098407745361, "learning_rate": 4.996673118640323e-05, "loss": 0.2082, "step": 380 }, { "epoch": 0.2028212962361048, "grad_norm": 1.6181299686431885, "learning_rate": 4.995892952796074e-05, "loss": 0.2422, "step": 390 }, { "epoch": 0.2080218422934408, "grad_norm": 2.0212459564208984, "learning_rate": 4.995030758711756e-05, "loss": 0.296, "step": 400 }, { "epoch": 0.21322238835077684, "grad_norm": 1.0081758499145508, "learning_rate": 4.994086564723626e-05, "loss": 0.2289, "step": 410 }, { "epoch": 0.21842293440811286, "grad_norm": 1.7539795637130737, "learning_rate": 4.993060401862888e-05, "loss": 0.2118, "step": 420 }, { "epoch": 0.22362348046544886, "grad_norm": 1.8362935781478882, "learning_rate": 4.991952303854682e-05, "loss": 0.2198, "step": 430 }, { "epoch": 0.22882402652278488, "grad_norm": 3.820204734802246, "learning_rate": 4.9907623071169686e-05, "loss": 0.2744, "step": 440 }, { "epoch": 0.2340245725801209, "grad_norm": 2.739043951034546, "learning_rate": 4.9894904507593316e-05, "loss": 0.1887, "step": 450 }, { "epoch": 0.23922511863745693, "grad_norm": 1.1746188402175903, "learning_rate": 4.988136776581696e-05, "loss": 0.2105, "step": 460 }, { "epoch": 0.24442566469479296, "grad_norm": 4.921947956085205, "learning_rate": 4.9867013290729535e-05, "loss": 0.2306, "step": 470 }, { "epoch": 0.24962621075212899, "grad_norm": 4.337555408477783, "learning_rate": 4.9851841554095e-05, "loss": 0.2564, "step": 480 }, { "epoch": 0.254826756809465, "grad_norm": 3.977388620376587, "learning_rate": 4.9835853054536846e-05, "loss": 0.2106, "step": 490 }, { "epoch": 0.26002730286680104, "grad_norm": 1.2592873573303223, "learning_rate": 4.981904831752171e-05, "loss": 0.3106, "step": 500 }, { "epoch": 0.26002730286680104, "eval_loss": 0.2771838307380676, "eval_runtime": 136.7574, "eval_samples_per_second": 14.054, "eval_steps_per_second": 14.054, "step": 500 }, { "epoch": 0.26522784892413703, "grad_norm": 1.807676911354065, "learning_rate": 4.98014278953421e-05, "loss": 0.2341, "step": 510 }, { "epoch": 0.27042839498147303, "grad_norm": 2.764136791229248, "learning_rate": 4.978299236709826e-05, "loss": 0.339, "step": 520 }, { "epoch": 0.2756289410388091, "grad_norm": 1.2402966022491455, "learning_rate": 4.9763742338679145e-05, "loss": 0.2754, "step": 530 }, { "epoch": 0.2808294870961451, "grad_norm": 1.5016759634017944, "learning_rate": 4.974367844274248e-05, "loss": 0.2544, "step": 540 }, { "epoch": 0.28603003315348113, "grad_norm": 2.2027359008789062, "learning_rate": 4.972280133869396e-05, "loss": 0.232, "step": 550 }, { "epoch": 0.29123057921081713, "grad_norm": 0.8741855621337891, "learning_rate": 4.9701111712665625e-05, "loss": 0.2665, "step": 560 }, { "epoch": 0.29643112526815313, "grad_norm": 2.105534315109253, "learning_rate": 4.9678610277493275e-05, "loss": 0.2719, "step": 570 }, { "epoch": 0.3016316713254892, "grad_norm": 2.820169687271118, "learning_rate": 4.965529777269306e-05, "loss": 0.2875, "step": 580 }, { "epoch": 0.3068322173828252, "grad_norm": 1.968910813331604, "learning_rate": 4.963117496443715e-05, "loss": 0.2547, "step": 590 }, { "epoch": 0.31203276344016123, "grad_norm": 1.4258973598480225, "learning_rate": 4.960624264552858e-05, "loss": 0.3096, "step": 600 }, { "epoch": 0.31723330949749723, "grad_norm": 0.6942580342292786, "learning_rate": 4.958050163537519e-05, "loss": 0.2271, "step": 610 }, { "epoch": 0.3224338555548333, "grad_norm": 2.4023945331573486, "learning_rate": 4.955395277996268e-05, "loss": 0.2973, "step": 620 }, { "epoch": 0.3276344016121693, "grad_norm": 0.890560507774353, "learning_rate": 4.9526596951826824e-05, "loss": 0.2368, "step": 630 }, { "epoch": 0.3328349476695053, "grad_norm": 1.4097232818603516, "learning_rate": 4.949843505002477e-05, "loss": 0.1829, "step": 640 }, { "epoch": 0.33803549372684133, "grad_norm": 1.28754723072052, "learning_rate": 4.946946800010556e-05, "loss": 0.3505, "step": 650 }, { "epoch": 0.3432360397841773, "grad_norm": 0.8762970566749573, "learning_rate": 4.9439696754079595e-05, "loss": 0.2356, "step": 660 }, { "epoch": 0.3484365858415134, "grad_norm": 2.1406095027923584, "learning_rate": 4.940912229038745e-05, "loss": 0.2232, "step": 670 }, { "epoch": 0.3536371318988494, "grad_norm": 1.4764164686203003, "learning_rate": 4.937774561386768e-05, "loss": 0.2281, "step": 680 }, { "epoch": 0.3588376779561854, "grad_norm": 1.5396536588668823, "learning_rate": 4.934556775572377e-05, "loss": 0.2875, "step": 690 }, { "epoch": 0.3640382240135214, "grad_norm": 1.0842628479003906, "learning_rate": 4.9312589773490304e-05, "loss": 0.2562, "step": 700 }, { "epoch": 0.3692387700708574, "grad_norm": 1.8963087797164917, "learning_rate": 4.927881275099815e-05, "loss": 0.2413, "step": 710 }, { "epoch": 0.3744393161281935, "grad_norm": 1.5899958610534668, "learning_rate": 4.9244237798338866e-05, "loss": 0.2979, "step": 720 }, { "epoch": 0.3796398621855295, "grad_norm": 0.8220577836036682, "learning_rate": 4.920886605182823e-05, "loss": 0.2868, "step": 730 }, { "epoch": 0.38484040824286553, "grad_norm": 1.0545523166656494, "learning_rate": 4.917269867396886e-05, "loss": 0.194, "step": 740 }, { "epoch": 0.3900409543002015, "grad_norm": 1.3721591234207153, "learning_rate": 4.913573685341205e-05, "loss": 0.2109, "step": 750 }, { "epoch": 0.3952415003575375, "grad_norm": 0.9382643699645996, "learning_rate": 4.909798180491865e-05, "loss": 0.2194, "step": 760 }, { "epoch": 0.4004420464148736, "grad_norm": 0.6716025471687317, "learning_rate": 4.9059434769319205e-05, "loss": 0.2021, "step": 770 }, { "epoch": 0.4056425924722096, "grad_norm": 2.405698537826538, "learning_rate": 4.902009701347313e-05, "loss": 0.2933, "step": 780 }, { "epoch": 0.4108431385295456, "grad_norm": 1.7277915477752686, "learning_rate": 4.8979969830227086e-05, "loss": 0.2376, "step": 790 }, { "epoch": 0.4160436845868816, "grad_norm": 1.790748119354248, "learning_rate": 4.8939054538372496e-05, "loss": 0.2227, "step": 800 }, { "epoch": 0.4212442306442176, "grad_norm": 1.2813634872436523, "learning_rate": 4.889735248260221e-05, "loss": 0.2544, "step": 810 }, { "epoch": 0.4264447767015537, "grad_norm": 0.9295778870582581, "learning_rate": 4.8854865033466275e-05, "loss": 0.1625, "step": 820 }, { "epoch": 0.43164532275888967, "grad_norm": 1.9681141376495361, "learning_rate": 4.881159358732694e-05, "loss": 0.2262, "step": 830 }, { "epoch": 0.4368458688162257, "grad_norm": 1.1844898462295532, "learning_rate": 4.8767539566312734e-05, "loss": 0.2683, "step": 840 }, { "epoch": 0.4420464148735617, "grad_norm": 1.1099355220794678, "learning_rate": 4.8722704418271745e-05, "loss": 0.2281, "step": 850 }, { "epoch": 0.4472469609308977, "grad_norm": 1.4917421340942383, "learning_rate": 4.867708961672399e-05, "loss": 0.3092, "step": 860 }, { "epoch": 0.45244750698823377, "grad_norm": 1.1806445121765137, "learning_rate": 4.863069666081307e-05, "loss": 0.2272, "step": 870 }, { "epoch": 0.45764805304556977, "grad_norm": 1.3496099710464478, "learning_rate": 4.8583527075256804e-05, "loss": 0.2299, "step": 880 }, { "epoch": 0.4628485991029058, "grad_norm": 2.9580721855163574, "learning_rate": 4.853558241029723e-05, "loss": 0.2648, "step": 890 }, { "epoch": 0.4680491451602418, "grad_norm": 0.47517985105514526, "learning_rate": 4.848686424164953e-05, "loss": 0.2166, "step": 900 }, { "epoch": 0.4732496912175779, "grad_norm": 1.1966201066970825, "learning_rate": 4.8437374170450344e-05, "loss": 0.2499, "step": 910 }, { "epoch": 0.47845023727491387, "grad_norm": 1.4806653261184692, "learning_rate": 4.8387113823205096e-05, "loss": 0.2532, "step": 920 }, { "epoch": 0.48365078333224987, "grad_norm": 1.9070792198181152, "learning_rate": 4.833608485173457e-05, "loss": 0.2721, "step": 930 }, { "epoch": 0.4888513293895859, "grad_norm": 1.1496449708938599, "learning_rate": 4.8284288933120594e-05, "loss": 0.2181, "step": 940 }, { "epoch": 0.4940518754469219, "grad_norm": 1.1686209440231323, "learning_rate": 4.823172776965094e-05, "loss": 0.2084, "step": 950 }, { "epoch": 0.49925242150425797, "grad_norm": 1.7963812351226807, "learning_rate": 4.8178403088763355e-05, "loss": 0.2436, "step": 960 }, { "epoch": 0.504452967561594, "grad_norm": 1.3361034393310547, "learning_rate": 4.812431664298883e-05, "loss": 0.1777, "step": 970 }, { "epoch": 0.50965351361893, "grad_norm": 0.7462561726570129, "learning_rate": 4.8069470209893974e-05, "loss": 0.2749, "step": 980 }, { "epoch": 0.514854059676266, "grad_norm": 1.4435970783233643, "learning_rate": 4.801386559202259e-05, "loss": 0.2099, "step": 990 }, { "epoch": 0.5200546057336021, "grad_norm": 1.6081739664077759, "learning_rate": 4.795750461683644e-05, "loss": 0.2594, "step": 1000 }, { "epoch": 0.5200546057336021, "eval_loss": 0.24056576192378998, "eval_runtime": 134.5423, "eval_samples_per_second": 14.285, "eval_steps_per_second": 14.285, "step": 1000 }, { "epoch": 0.5252551517909381, "grad_norm": 0.9048750996589661, "learning_rate": 4.790038913665519e-05, "loss": 0.2459, "step": 1010 }, { "epoch": 0.5304556978482741, "grad_norm": 1.2910796403884888, "learning_rate": 4.7842521028595526e-05, "loss": 0.2357, "step": 1020 }, { "epoch": 0.5356562439056101, "grad_norm": 1.6829766035079956, "learning_rate": 4.778390219450949e-05, "loss": 0.2348, "step": 1030 }, { "epoch": 0.5408567899629461, "grad_norm": 2.526048421859741, "learning_rate": 4.772453456092191e-05, "loss": 0.2503, "step": 1040 }, { "epoch": 0.5460573360202822, "grad_norm": 0.8338559865951538, "learning_rate": 4.766442007896715e-05, "loss": 0.1851, "step": 1050 }, { "epoch": 0.5512578820776182, "grad_norm": 2.0072736740112305, "learning_rate": 4.760356072432498e-05, "loss": 0.3063, "step": 1060 }, { "epoch": 0.5564584281349542, "grad_norm": 2.7068746089935303, "learning_rate": 4.754195849715557e-05, "loss": 0.2264, "step": 1070 }, { "epoch": 0.5616589741922902, "grad_norm": 1.7025487422943115, "learning_rate": 4.747961542203386e-05, "loss": 0.1975, "step": 1080 }, { "epoch": 0.5668595202496262, "grad_norm": 1.6216896772384644, "learning_rate": 4.741653354788295e-05, "loss": 0.232, "step": 1090 }, { "epoch": 0.5720600663069623, "grad_norm": 1.5931206941604614, "learning_rate": 4.735271494790678e-05, "loss": 0.2607, "step": 1100 }, { "epoch": 0.5772606123642983, "grad_norm": 1.2996855974197388, "learning_rate": 4.7288161719522016e-05, "loss": 0.2148, "step": 1110 }, { "epoch": 0.5824611584216343, "grad_norm": 1.3389666080474854, "learning_rate": 4.722287598428907e-05, "loss": 0.2831, "step": 1120 }, { "epoch": 0.5876617044789703, "grad_norm": 2.0776829719543457, "learning_rate": 4.7156859887842416e-05, "loss": 0.3034, "step": 1130 }, { "epoch": 0.5928622505363063, "grad_norm": 0.8629754781723022, "learning_rate": 4.709011559982006e-05, "loss": 0.2287, "step": 1140 }, { "epoch": 0.5980627965936424, "grad_norm": 1.2654669284820557, "learning_rate": 4.7022645313792235e-05, "loss": 0.2223, "step": 1150 }, { "epoch": 0.6032633426509784, "grad_norm": 1.1408824920654297, "learning_rate": 4.695445124718931e-05, "loss": 0.1832, "step": 1160 }, { "epoch": 0.6084638887083144, "grad_norm": 1.0831233263015747, "learning_rate": 4.6885535641228904e-05, "loss": 0.2787, "step": 1170 }, { "epoch": 0.6136644347656504, "grad_norm": 1.243690848350525, "learning_rate": 4.6815900760842236e-05, "loss": 0.2505, "step": 1180 }, { "epoch": 0.6188649808229865, "grad_norm": 2.173030138015747, "learning_rate": 4.674554889459968e-05, "loss": 0.2526, "step": 1190 }, { "epoch": 0.6240655268803225, "grad_norm": 1.0949965715408325, "learning_rate": 4.667448235463557e-05, "loss": 0.233, "step": 1200 }, { "epoch": 0.6292660729376585, "grad_norm": 2.3284902572631836, "learning_rate": 4.660270347657219e-05, "loss": 0.2447, "step": 1210 }, { "epoch": 0.6344666189949945, "grad_norm": 1.0869665145874023, "learning_rate": 4.6530214619443037e-05, "loss": 0.2217, "step": 1220 }, { "epoch": 0.6396671650523305, "grad_norm": 1.639493465423584, "learning_rate": 4.645701816561523e-05, "loss": 0.2322, "step": 1230 }, { "epoch": 0.6448677111096666, "grad_norm": 1.2198299169540405, "learning_rate": 4.63831165207113e-05, "loss": 0.1883, "step": 1240 }, { "epoch": 0.6500682571670026, "grad_norm": 1.4124974012374878, "learning_rate": 4.630851211353007e-05, "loss": 0.2559, "step": 1250 }, { "epoch": 0.6552688032243386, "grad_norm": 1.7080676555633545, "learning_rate": 4.623320739596685e-05, "loss": 0.2219, "step": 1260 }, { "epoch": 0.6604693492816746, "grad_norm": 2.443284511566162, "learning_rate": 4.615720484293286e-05, "loss": 0.2324, "step": 1270 }, { "epoch": 0.6656698953390106, "grad_norm": 0.6745538115501404, "learning_rate": 4.608050695227385e-05, "loss": 0.2877, "step": 1280 }, { "epoch": 0.6708704413963467, "grad_norm": 1.1423040628433228, "learning_rate": 4.60031162446881e-05, "loss": 0.2469, "step": 1290 }, { "epoch": 0.6760709874536827, "grad_norm": 1.5825380086898804, "learning_rate": 4.5925035263643444e-05, "loss": 0.2699, "step": 1300 }, { "epoch": 0.6812715335110187, "grad_norm": 1.138910174369812, "learning_rate": 4.5846266575293816e-05, "loss": 0.2457, "step": 1310 }, { "epoch": 0.6864720795683547, "grad_norm": 1.3718457221984863, "learning_rate": 4.576681276839483e-05, "loss": 0.2485, "step": 1320 }, { "epoch": 0.6916726256256907, "grad_norm": 1.4293012619018555, "learning_rate": 4.56866764542187e-05, "loss": 0.2458, "step": 1330 }, { "epoch": 0.6968731716830268, "grad_norm": 1.009885311126709, "learning_rate": 4.560586026646845e-05, "loss": 0.2077, "step": 1340 }, { "epoch": 0.7020737177403628, "grad_norm": 0.6243613362312317, "learning_rate": 4.552436686119134e-05, "loss": 0.2204, "step": 1350 }, { "epoch": 0.7072742637976988, "grad_norm": 1.6868172883987427, "learning_rate": 4.54421989166916e-05, "loss": 0.2372, "step": 1360 }, { "epoch": 0.7124748098550348, "grad_norm": 1.7123680114746094, "learning_rate": 4.5359359133442356e-05, "loss": 0.2613, "step": 1370 }, { "epoch": 0.7176753559123707, "grad_norm": 0.856176495552063, "learning_rate": 4.5275850233996925e-05, "loss": 0.2438, "step": 1380 }, { "epoch": 0.7228759019697069, "grad_norm": 1.1216453313827515, "learning_rate": 4.5191674962899314e-05, "loss": 0.2029, "step": 1390 }, { "epoch": 0.7280764480270429, "grad_norm": 1.8667545318603516, "learning_rate": 4.510683608659403e-05, "loss": 0.1938, "step": 1400 }, { "epoch": 0.7332769940843789, "grad_norm": 1.677372932434082, "learning_rate": 4.502133639333516e-05, "loss": 0.2053, "step": 1410 }, { "epoch": 0.7384775401417148, "grad_norm": 1.217119574546814, "learning_rate": 4.4935178693094714e-05, "loss": 0.1992, "step": 1420 }, { "epoch": 0.7436780861990508, "grad_norm": 2.1485345363616943, "learning_rate": 4.484836581747032e-05, "loss": 0.2454, "step": 1430 }, { "epoch": 0.748878632256387, "grad_norm": 1.3972569704055786, "learning_rate": 4.4760900619592085e-05, "loss": 0.1673, "step": 1440 }, { "epoch": 0.754079178313723, "grad_norm": 1.4621198177337646, "learning_rate": 4.467278597402894e-05, "loss": 0.2137, "step": 1450 }, { "epoch": 0.759279724371059, "grad_norm": 1.6665892601013184, "learning_rate": 4.4584024776694035e-05, "loss": 0.1556, "step": 1460 }, { "epoch": 0.764480270428395, "grad_norm": 1.4974132776260376, "learning_rate": 4.449461994474968e-05, "loss": 0.278, "step": 1470 }, { "epoch": 0.7696808164857311, "grad_norm": 0.9022512435913086, "learning_rate": 4.440457441651139e-05, "loss": 0.1929, "step": 1480 }, { "epoch": 0.774881362543067, "grad_norm": 1.8019062280654907, "learning_rate": 4.4313891151351375e-05, "loss": 0.2594, "step": 1490 }, { "epoch": 0.780081908600403, "grad_norm": 1.0030608177185059, "learning_rate": 4.422257312960123e-05, "loss": 0.1938, "step": 1500 }, { "epoch": 0.780081908600403, "eval_loss": 0.2387997955083847, "eval_runtime": 136.4254, "eval_samples_per_second": 14.088, "eval_steps_per_second": 14.088, "step": 1500 }, { "epoch": 0.785282454657739, "grad_norm": 1.8986437320709229, "learning_rate": 4.413062335245402e-05, "loss": 0.2154, "step": 1510 }, { "epoch": 0.790483000715075, "grad_norm": 1.5987744331359863, "learning_rate": 4.4038044841865614e-05, "loss": 0.2624, "step": 1520 }, { "epoch": 0.7956835467724112, "grad_norm": 1.032251000404358, "learning_rate": 4.394484064045542e-05, "loss": 0.2311, "step": 1530 }, { "epoch": 0.8008840928297472, "grad_norm": 1.9166332483291626, "learning_rate": 4.385101381140633e-05, "loss": 0.2384, "step": 1540 }, { "epoch": 0.8060846388870831, "grad_norm": 0.6986478567123413, "learning_rate": 4.375656743836407e-05, "loss": 0.1841, "step": 1550 }, { "epoch": 0.8112851849444191, "grad_norm": 0.631565511226654, "learning_rate": 4.366150462533588e-05, "loss": 0.2398, "step": 1560 }, { "epoch": 0.8164857310017551, "grad_norm": 1.0940667390823364, "learning_rate": 4.356582849658845e-05, "loss": 0.1876, "step": 1570 }, { "epoch": 0.8216862770590913, "grad_norm": 0.7327963709831238, "learning_rate": 4.34695421965453e-05, "loss": 0.2551, "step": 1580 }, { "epoch": 0.8268868231164272, "grad_norm": 1.5531721115112305, "learning_rate": 4.3372648889683364e-05, "loss": 0.1719, "step": 1590 }, { "epoch": 0.8320873691737632, "grad_norm": 0.8876403570175171, "learning_rate": 4.3275151760429075e-05, "loss": 0.2152, "step": 1600 }, { "epoch": 0.8372879152310992, "grad_norm": 2.079756259918213, "learning_rate": 4.317705401305362e-05, "loss": 0.2369, "step": 1610 }, { "epoch": 0.8424884612884352, "grad_norm": 1.2363635301589966, "learning_rate": 4.3078358871567706e-05, "loss": 0.2718, "step": 1620 }, { "epoch": 0.8476890073457714, "grad_norm": 1.3667513132095337, "learning_rate": 4.2979069579615564e-05, "loss": 0.2221, "step": 1630 }, { "epoch": 0.8528895534031073, "grad_norm": 1.1651591062545776, "learning_rate": 4.2879189400368314e-05, "loss": 0.2858, "step": 1640 }, { "epoch": 0.8580900994604433, "grad_norm": 0.9213271141052246, "learning_rate": 4.277872161641682e-05, "loss": 0.2187, "step": 1650 }, { "epoch": 0.8632906455177793, "grad_norm": 0.8052433133125305, "learning_rate": 4.267766952966369e-05, "loss": 0.2695, "step": 1660 }, { "epoch": 0.8684911915751153, "grad_norm": 1.9036948680877686, "learning_rate": 4.257603646121484e-05, "loss": 0.2253, "step": 1670 }, { "epoch": 0.8736917376324514, "grad_norm": 0.8116464018821716, "learning_rate": 4.247382575127031e-05, "loss": 0.2417, "step": 1680 }, { "epoch": 0.8788922836897874, "grad_norm": 1.7750636339187622, "learning_rate": 4.237104075901449e-05, "loss": 0.2438, "step": 1690 }, { "epoch": 0.8840928297471234, "grad_norm": 0.9960026144981384, "learning_rate": 4.226768486250572e-05, "loss": 0.2928, "step": 1700 }, { "epoch": 0.8892933758044594, "grad_norm": 1.5663594007492065, "learning_rate": 4.216376145856529e-05, "loss": 0.249, "step": 1710 }, { "epoch": 0.8944939218617954, "grad_norm": 2.8207902908325195, "learning_rate": 4.205927396266577e-05, "loss": 0.233, "step": 1720 }, { "epoch": 0.8996944679191315, "grad_norm": 0.683710515499115, "learning_rate": 4.195422580881878e-05, "loss": 0.1886, "step": 1730 }, { "epoch": 0.9048950139764675, "grad_norm": 1.2048577070236206, "learning_rate": 4.1848620449462115e-05, "loss": 0.205, "step": 1740 }, { "epoch": 0.9100955600338035, "grad_norm": 1.833343505859375, "learning_rate": 4.17424613553463e-05, "loss": 0.2846, "step": 1750 }, { "epoch": 0.9152961060911395, "grad_norm": 1.2163664102554321, "learning_rate": 4.163575201542052e-05, "loss": 0.2269, "step": 1760 }, { "epoch": 0.9204966521484755, "grad_norm": 0.7797666788101196, "learning_rate": 4.152849593671793e-05, "loss": 0.1856, "step": 1770 }, { "epoch": 0.9256971982058116, "grad_norm": 1.4620978832244873, "learning_rate": 4.142069664424041e-05, "loss": 0.2599, "step": 1780 }, { "epoch": 0.9308977442631476, "grad_norm": 0.480034202337265, "learning_rate": 4.1312357680842735e-05, "loss": 0.2485, "step": 1790 }, { "epoch": 0.9360982903204836, "grad_norm": 1.0644006729125977, "learning_rate": 4.120348260711611e-05, "loss": 0.2576, "step": 1800 }, { "epoch": 0.9412988363778196, "grad_norm": 1.8595833778381348, "learning_rate": 4.109407500127116e-05, "loss": 0.2438, "step": 1810 }, { "epoch": 0.9464993824351557, "grad_norm": 0.9909834861755371, "learning_rate": 4.098413845902033e-05, "loss": 0.241, "step": 1820 }, { "epoch": 0.9516999284924917, "grad_norm": 1.157691478729248, "learning_rate": 4.0873676593459725e-05, "loss": 0.2383, "step": 1830 }, { "epoch": 0.9569004745498277, "grad_norm": 1.2096604108810425, "learning_rate": 4.076269303495033e-05, "loss": 0.2554, "step": 1840 }, { "epoch": 0.9621010206071637, "grad_norm": 0.8286678194999695, "learning_rate": 4.065119143099874e-05, "loss": 0.1894, "step": 1850 }, { "epoch": 0.9673015666644997, "grad_norm": 0.9873716235160828, "learning_rate": 4.053917544613723e-05, "loss": 0.2311, "step": 1860 }, { "epoch": 0.9725021127218358, "grad_norm": 0.9408676028251648, "learning_rate": 4.042664876180341e-05, "loss": 0.2386, "step": 1870 }, { "epoch": 0.9777026587791718, "grad_norm": 0.6958754062652588, "learning_rate": 4.031361507621911e-05, "loss": 0.2468, "step": 1880 }, { "epoch": 0.9829032048365078, "grad_norm": 0.8920957446098328, "learning_rate": 4.0200078104268944e-05, "loss": 0.2584, "step": 1890 }, { "epoch": 0.9881037508938438, "grad_norm": 1.3254570960998535, "learning_rate": 4.0086041577378166e-05, "loss": 0.2755, "step": 1900 }, { "epoch": 0.9933042969511798, "grad_norm": 1.2101293802261353, "learning_rate": 3.9971509243390025e-05, "loss": 0.2417, "step": 1910 }, { "epoch": 0.9985048430085159, "grad_norm": 0.42130109667778015, "learning_rate": 3.985648486644267e-05, "loss": 0.1982, "step": 1920 }, { "epoch": 1.0036403822401352, "grad_norm": 2.4333481788635254, "learning_rate": 3.974097222684532e-05, "loss": 0.2277, "step": 1930 }, { "epoch": 1.0088409282974713, "grad_norm": 1.6568609476089478, "learning_rate": 3.962497512095412e-05, "loss": 0.1901, "step": 1940 }, { "epoch": 1.0140414743548072, "grad_norm": 1.0351656675338745, "learning_rate": 3.9508497361047334e-05, "loss": 0.2923, "step": 1950 }, { "epoch": 1.0192420204121433, "grad_norm": 0.8283625245094299, "learning_rate": 3.939154277520006e-05, "loss": 0.2245, "step": 1960 }, { "epoch": 1.0244425664694794, "grad_norm": 0.6887472867965698, "learning_rate": 3.92741152071584e-05, "loss": 0.1447, "step": 1970 }, { "epoch": 1.0296431125268153, "grad_norm": 2.1077232360839844, "learning_rate": 3.915621851621318e-05, "loss": 0.2368, "step": 1980 }, { "epoch": 1.0348436585841514, "grad_norm": 0.7262524366378784, "learning_rate": 3.903785657707307e-05, "loss": 0.2153, "step": 1990 }, { "epoch": 1.0400442046414873, "grad_norm": 0.6093840003013611, "learning_rate": 3.8919033279737274e-05, "loss": 0.1695, "step": 2000 }, { "epoch": 1.0400442046414873, "eval_loss": 0.24628731608390808, "eval_runtime": 134.8334, "eval_samples_per_second": 14.255, "eval_steps_per_second": 14.255, "step": 2000 }, { "epoch": 1.0452447506988234, "grad_norm": 1.6017835140228271, "learning_rate": 3.879975252936761e-05, "loss": 0.202, "step": 2010 }, { "epoch": 1.0504452967561595, "grad_norm": 1.7225841283798218, "learning_rate": 3.8680018246160295e-05, "loss": 0.1952, "step": 2020 }, { "epoch": 1.0556458428134954, "grad_norm": 2.1085808277130127, "learning_rate": 3.855983436521699e-05, "loss": 0.2721, "step": 2030 }, { "epoch": 1.0608463888708315, "grad_norm": 0.8755818605422974, "learning_rate": 3.843920483641551e-05, "loss": 0.2199, "step": 2040 }, { "epoch": 1.0660469349281674, "grad_norm": 0.6190668344497681, "learning_rate": 3.831813362428005e-05, "loss": 0.1944, "step": 2050 }, { "epoch": 1.0712474809855035, "grad_norm": 0.6328080296516418, "learning_rate": 3.819662470785082e-05, "loss": 0.2687, "step": 2060 }, { "epoch": 1.0764480270428396, "grad_norm": 1.3243086338043213, "learning_rate": 3.8074682080553335e-05, "loss": 0.1866, "step": 2070 }, { "epoch": 1.0816485731001755, "grad_norm": 1.4289870262145996, "learning_rate": 3.795230975006712e-05, "loss": 0.1979, "step": 2080 }, { "epoch": 1.0868491191575116, "grad_norm": 1.1440227031707764, "learning_rate": 3.782951173819403e-05, "loss": 0.2097, "step": 2090 }, { "epoch": 1.0920496652148475, "grad_norm": 0.7256899476051331, "learning_rate": 3.7706292080726055e-05, "loss": 0.2522, "step": 2100 }, { "epoch": 1.0972502112721836, "grad_norm": 1.0164716243743896, "learning_rate": 3.75826548273127e-05, "loss": 0.2312, "step": 2110 }, { "epoch": 1.1024507573295197, "grad_norm": 1.053582787513733, "learning_rate": 3.7458604041327874e-05, "loss": 0.1406, "step": 2120 }, { "epoch": 1.1076513033868556, "grad_norm": 1.578212022781372, "learning_rate": 3.733414379973635e-05, "loss": 0.1913, "step": 2130 }, { "epoch": 1.1128518494441917, "grad_norm": 1.1891608238220215, "learning_rate": 3.720927819295979e-05, "loss": 0.2298, "step": 2140 }, { "epoch": 1.1180523955015276, "grad_norm": 0.4603135585784912, "learning_rate": 3.708401132474228e-05, "loss": 0.2261, "step": 2150 }, { "epoch": 1.1232529415588637, "grad_norm": 2.1462292671203613, "learning_rate": 3.695834731201548e-05, "loss": 0.2354, "step": 2160 }, { "epoch": 1.1284534876161998, "grad_norm": 1.139315128326416, "learning_rate": 3.683229028476334e-05, "loss": 0.1615, "step": 2170 }, { "epoch": 1.1336540336735357, "grad_norm": 1.1548924446105957, "learning_rate": 3.6705844385886334e-05, "loss": 0.1705, "step": 2180 }, { "epoch": 1.1388545797308718, "grad_norm": 1.0922483205795288, "learning_rate": 3.6579013771065305e-05, "loss": 0.1906, "step": 2190 }, { "epoch": 1.1440551257882077, "grad_norm": 0.8926368951797485, "learning_rate": 3.645180260862492e-05, "loss": 0.1744, "step": 2200 }, { "epoch": 1.1492556718455438, "grad_norm": 1.1546534299850464, "learning_rate": 3.632421507939661e-05, "loss": 0.2112, "step": 2210 }, { "epoch": 1.1544562179028799, "grad_norm": 1.9052295684814453, "learning_rate": 3.6196255376581254e-05, "loss": 0.2351, "step": 2220 }, { "epoch": 1.1596567639602158, "grad_norm": 0.9189292788505554, "learning_rate": 3.6067927705611304e-05, "loss": 0.2165, "step": 2230 }, { "epoch": 1.1648573100175519, "grad_norm": 0.5956322550773621, "learning_rate": 3.593923628401259e-05, "loss": 0.2127, "step": 2240 }, { "epoch": 1.1700578560748878, "grad_norm": 2.0540506839752197, "learning_rate": 3.581018534126571e-05, "loss": 0.2175, "step": 2250 }, { "epoch": 1.1752584021322239, "grad_norm": 0.8053009510040283, "learning_rate": 3.568077911866703e-05, "loss": 0.2046, "step": 2260 }, { "epoch": 1.18045894818956, "grad_norm": 1.437412142753601, "learning_rate": 3.5551021869189286e-05, "loss": 0.2297, "step": 2270 }, { "epoch": 1.1856594942468959, "grad_norm": 0.7657543420791626, "learning_rate": 3.542091785734184e-05, "loss": 0.1784, "step": 2280 }, { "epoch": 1.190860040304232, "grad_norm": 1.170629620552063, "learning_rate": 3.529047135903045e-05, "loss": 0.1824, "step": 2290 }, { "epoch": 1.1960605863615679, "grad_norm": 1.3208539485931396, "learning_rate": 3.5159686661416834e-05, "loss": 0.1682, "step": 2300 }, { "epoch": 1.201261132418904, "grad_norm": 0.5824002027511597, "learning_rate": 3.502856806277773e-05, "loss": 0.1631, "step": 2310 }, { "epoch": 1.20646167847624, "grad_norm": 2.711642265319824, "learning_rate": 3.489711987236357e-05, "loss": 0.1973, "step": 2320 }, { "epoch": 1.211662224533576, "grad_norm": 0.9232580661773682, "learning_rate": 3.476534641025698e-05, "loss": 0.246, "step": 2330 }, { "epoch": 1.216862770590912, "grad_norm": 1.4809739589691162, "learning_rate": 3.463325200723071e-05, "loss": 0.2476, "step": 2340 }, { "epoch": 1.222063316648248, "grad_norm": 1.0022258758544922, "learning_rate": 3.4500841004605324e-05, "loss": 0.1629, "step": 2350 }, { "epoch": 1.227263862705584, "grad_norm": 0.6187863945960999, "learning_rate": 3.436811775410651e-05, "loss": 0.2049, "step": 2360 }, { "epoch": 1.2324644087629202, "grad_norm": 1.0579588413238525, "learning_rate": 3.42350866177221e-05, "loss": 0.1923, "step": 2370 }, { "epoch": 1.237664954820256, "grad_norm": 0.8715612888336182, "learning_rate": 3.410175196755866e-05, "loss": 0.1777, "step": 2380 }, { "epoch": 1.2428655008775922, "grad_norm": 1.0652248859405518, "learning_rate": 3.396811818569785e-05, "loss": 0.258, "step": 2390 }, { "epoch": 1.248066046934928, "grad_norm": 1.5773491859436035, "learning_rate": 3.383418966405234e-05, "loss": 0.2021, "step": 2400 }, { "epoch": 1.2532665929922642, "grad_norm": 1.5874974727630615, "learning_rate": 3.369997080422155e-05, "loss": 0.2206, "step": 2410 }, { "epoch": 1.2584671390496003, "grad_norm": 1.1131178140640259, "learning_rate": 3.356546601734692e-05, "loss": 0.2099, "step": 2420 }, { "epoch": 1.2636676851069362, "grad_norm": 1.019285798072815, "learning_rate": 3.3430679723966976e-05, "loss": 0.2599, "step": 2430 }, { "epoch": 1.2688682311642723, "grad_norm": 1.3517482280731201, "learning_rate": 3.3295616353872026e-05, "loss": 0.1706, "step": 2440 }, { "epoch": 1.2740687772216082, "grad_norm": 1.2477843761444092, "learning_rate": 3.3160280345958614e-05, "loss": 0.2172, "step": 2450 }, { "epoch": 1.2792693232789443, "grad_norm": 0.7591115236282349, "learning_rate": 3.3024676148083555e-05, "loss": 0.2201, "step": 2460 }, { "epoch": 1.2844698693362804, "grad_norm": 1.461832046508789, "learning_rate": 3.288880821691785e-05, "loss": 0.1695, "step": 2470 }, { "epoch": 1.2896704153936163, "grad_norm": 1.8396881818771362, "learning_rate": 3.2752681017800144e-05, "loss": 0.175, "step": 2480 }, { "epoch": 1.2948709614509524, "grad_norm": 1.3018438816070557, "learning_rate": 3.261629902459e-05, "loss": 0.2071, "step": 2490 }, { "epoch": 1.3000715075082883, "grad_norm": 1.120477557182312, "learning_rate": 3.2479666719520886e-05, "loss": 0.1841, "step": 2500 }, { "epoch": 1.3000715075082883, "eval_loss": 0.23911671340465546, "eval_runtime": 135.2893, "eval_samples_per_second": 14.207, "eval_steps_per_second": 14.207, "step": 2500 }, { "epoch": 1.3052720535656244, "grad_norm": 2.349160671234131, "learning_rate": 3.23427885930528e-05, "loss": 0.1993, "step": 2510 }, { "epoch": 1.3104725996229605, "grad_norm": 0.9985238313674927, "learning_rate": 3.220566914372477e-05, "loss": 0.1448, "step": 2520 }, { "epoch": 1.3156731456802964, "grad_norm": 1.038683295249939, "learning_rate": 3.2068312878006955e-05, "loss": 0.1793, "step": 2530 }, { "epoch": 1.3208736917376325, "grad_norm": 1.3996448516845703, "learning_rate": 3.193072431015254e-05, "loss": 0.1495, "step": 2540 }, { "epoch": 1.3260742377949684, "grad_norm": 1.8597303628921509, "learning_rate": 3.17929079620494e-05, "loss": 0.1746, "step": 2550 }, { "epoch": 1.3312747838523045, "grad_norm": 0.5454281568527222, "learning_rate": 3.1654868363071484e-05, "loss": 0.1633, "step": 2560 }, { "epoch": 1.3364753299096406, "grad_norm": 2.386983871459961, "learning_rate": 3.151661004992992e-05, "loss": 0.2391, "step": 2570 }, { "epoch": 1.3416758759669765, "grad_norm": 1.90854811668396, "learning_rate": 3.137813756652395e-05, "loss": 0.1816, "step": 2580 }, { "epoch": 1.3468764220243126, "grad_norm": 0.8159545063972473, "learning_rate": 3.12394554637916e-05, "loss": 0.235, "step": 2590 }, { "epoch": 1.3520769680816485, "grad_norm": 1.6975359916687012, "learning_rate": 3.110056829956006e-05, "loss": 0.1799, "step": 2600 }, { "epoch": 1.3572775141389846, "grad_norm": 1.2948479652404785, "learning_rate": 3.096148063839596e-05, "loss": 0.1747, "step": 2610 }, { "epoch": 1.3624780601963207, "grad_norm": 1.0926662683486938, "learning_rate": 3.08221970514553e-05, "loss": 0.1946, "step": 2620 }, { "epoch": 1.3676786062536566, "grad_norm": 2.317523956298828, "learning_rate": 3.068272211633326e-05, "loss": 0.2677, "step": 2630 }, { "epoch": 1.3728791523109927, "grad_norm": 1.379921555519104, "learning_rate": 3.0543060416913696e-05, "loss": 0.2897, "step": 2640 }, { "epoch": 1.3780796983683286, "grad_norm": 1.2815351486206055, "learning_rate": 3.0403216543218547e-05, "loss": 0.205, "step": 2650 }, { "epoch": 1.3832802444256647, "grad_norm": 1.7982994318008423, "learning_rate": 3.026319509125697e-05, "loss": 0.1774, "step": 2660 }, { "epoch": 1.3884807904830008, "grad_norm": 2.2039549350738525, "learning_rate": 3.0123000662874272e-05, "loss": 0.1811, "step": 2670 }, { "epoch": 1.3936813365403367, "grad_norm": 1.7380796670913696, "learning_rate": 2.9982637865600683e-05, "loss": 0.2688, "step": 2680 }, { "epoch": 1.3988818825976728, "grad_norm": 0.9833778738975525, "learning_rate": 2.9842111312499914e-05, "loss": 0.1609, "step": 2690 }, { "epoch": 1.4040824286550087, "grad_norm": 2.575516939163208, "learning_rate": 2.9701425622017583e-05, "loss": 0.1734, "step": 2700 }, { "epoch": 1.4092829747123448, "grad_norm": 3.007417678833008, "learning_rate": 2.9560585417829368e-05, "loss": 0.2598, "step": 2710 }, { "epoch": 1.4144835207696809, "grad_norm": 1.1851876974105835, "learning_rate": 2.9419595328689138e-05, "loss": 0.1271, "step": 2720 }, { "epoch": 1.4196840668270168, "grad_norm": 2.1141178607940674, "learning_rate": 2.9278459988276703e-05, "loss": 0.1752, "step": 2730 }, { "epoch": 1.4248846128843529, "grad_norm": 1.5198488235473633, "learning_rate": 2.913718403504567e-05, "loss": 0.2225, "step": 2740 }, { "epoch": 1.4300851589416887, "grad_norm": 0.9600934386253357, "learning_rate": 2.899577211207087e-05, "loss": 0.2169, "step": 2750 }, { "epoch": 1.4352857049990249, "grad_norm": 1.3893183469772339, "learning_rate": 2.8854228866895855e-05, "loss": 0.2257, "step": 2760 }, { "epoch": 1.440486251056361, "grad_norm": 1.2468478679656982, "learning_rate": 2.8712558951380097e-05, "loss": 0.221, "step": 2770 }, { "epoch": 1.445686797113697, "grad_norm": 0.7069809436798096, "learning_rate": 2.857076702154614e-05, "loss": 0.1912, "step": 2780 }, { "epoch": 1.450887343171033, "grad_norm": 1.5114367008209229, "learning_rate": 2.8428857737426556e-05, "loss": 0.2006, "step": 2790 }, { "epoch": 1.4560878892283688, "grad_norm": 0.9951623678207397, "learning_rate": 2.8286835762910803e-05, "loss": 0.1765, "step": 2800 }, { "epoch": 1.461288435285705, "grad_norm": 0.7911898493766785, "learning_rate": 2.8144705765591938e-05, "loss": 0.1737, "step": 2810 }, { "epoch": 1.466488981343041, "grad_norm": 0.7575000524520874, "learning_rate": 2.800247241661321e-05, "loss": 0.2185, "step": 2820 }, { "epoch": 1.4716895274003772, "grad_norm": 1.342424988746643, "learning_rate": 2.7860140390514583e-05, "loss": 0.2083, "step": 2830 }, { "epoch": 1.476890073457713, "grad_norm": 2.5245749950408936, "learning_rate": 2.771771436507903e-05, "loss": 0.1811, "step": 2840 }, { "epoch": 1.482090619515049, "grad_norm": 2.4802660942077637, "learning_rate": 2.757519902117886e-05, "loss": 0.1575, "step": 2850 }, { "epoch": 1.487291165572385, "grad_norm": 1.177516222000122, "learning_rate": 2.743259904262187e-05, "loss": 0.2133, "step": 2860 }, { "epoch": 1.4924917116297212, "grad_norm": 1.1934640407562256, "learning_rate": 2.7289919115997374e-05, "loss": 0.23, "step": 2870 }, { "epoch": 1.4976922576870573, "grad_norm": 1.5221962928771973, "learning_rate": 2.714716393052223e-05, "loss": 0.2154, "step": 2880 }, { "epoch": 1.5028928037443932, "grad_norm": 2.0732405185699463, "learning_rate": 2.7004338177886672e-05, "loss": 0.1759, "step": 2890 }, { "epoch": 1.508093349801729, "grad_norm": 0.8759207129478455, "learning_rate": 2.686144655210016e-05, "loss": 0.2008, "step": 2900 }, { "epoch": 1.5132938958590652, "grad_norm": 0.9305397868156433, "learning_rate": 2.6718493749337105e-05, "loss": 0.1785, "step": 2910 }, { "epoch": 1.5184944419164013, "grad_norm": 0.9819073677062988, "learning_rate": 2.6575484467782486e-05, "loss": 0.2719, "step": 2920 }, { "epoch": 1.5236949879737374, "grad_norm": 2.144178628921509, "learning_rate": 2.6432423407477496e-05, "loss": 0.1598, "step": 2930 }, { "epoch": 1.5288955340310733, "grad_norm": 2.3962485790252686, "learning_rate": 2.6289315270165062e-05, "loss": 0.2127, "step": 2940 }, { "epoch": 1.5340960800884091, "grad_norm": 1.1640074253082275, "learning_rate": 2.6146164759135266e-05, "loss": 0.1784, "step": 2950 }, { "epoch": 1.5392966261457453, "grad_norm": 1.0884958505630493, "learning_rate": 2.6002976579070872e-05, "loss": 0.1717, "step": 2960 }, { "epoch": 1.5444971722030814, "grad_norm": 1.471543312072754, "learning_rate": 2.5859755435892597e-05, "loss": 0.1892, "step": 2970 }, { "epoch": 1.5496977182604175, "grad_norm": 1.1566507816314697, "learning_rate": 2.5716506036604542e-05, "loss": 0.2027, "step": 2980 }, { "epoch": 1.5548982643177534, "grad_norm": 1.8999615907669067, "learning_rate": 2.557323308913942e-05, "loss": 0.2162, "step": 2990 }, { "epoch": 1.5600988103750892, "grad_norm": 1.2542750835418701, "learning_rate": 2.542994130220388e-05, "loss": 0.1548, "step": 3000 }, { "epoch": 1.5600988103750892, "eval_loss": 0.24241599440574646, "eval_runtime": 135.9654, "eval_samples_per_second": 14.136, "eval_steps_per_second": 14.136, "step": 3000 }, { "epoch": 1.5652993564324253, "grad_norm": 2.8087780475616455, "learning_rate": 2.5286635385123725e-05, "loss": 0.24, "step": 3010 }, { "epoch": 1.5704999024897615, "grad_norm": 1.2270337343215942, "learning_rate": 2.5143320047689173e-05, "loss": 0.1968, "step": 3020 }, { "epoch": 1.5757004485470976, "grad_norm": 1.490675926208496, "learning_rate": 2.5e-05, "loss": 0.167, "step": 3030 }, { "epoch": 1.5809009946044335, "grad_norm": 0.7937414646148682, "learning_rate": 2.485667995231084e-05, "loss": 0.1436, "step": 3040 }, { "epoch": 1.5861015406617693, "grad_norm": 1.8276423215866089, "learning_rate": 2.4713364614876274e-05, "loss": 0.2169, "step": 3050 }, { "epoch": 1.5913020867191054, "grad_norm": 2.1891725063323975, "learning_rate": 2.4570058697796125e-05, "loss": 0.2003, "step": 3060 }, { "epoch": 1.5965026327764416, "grad_norm": 1.920414686203003, "learning_rate": 2.4426766910860585e-05, "loss": 0.224, "step": 3070 }, { "epoch": 1.6017031788337777, "grad_norm": 1.974658727645874, "learning_rate": 2.428349396339547e-05, "loss": 0.1934, "step": 3080 }, { "epoch": 1.6069037248911135, "grad_norm": 2.3854596614837646, "learning_rate": 2.4140244564107402e-05, "loss": 0.2128, "step": 3090 }, { "epoch": 1.6121042709484494, "grad_norm": 1.476598858833313, "learning_rate": 2.3997023420929137e-05, "loss": 0.1819, "step": 3100 }, { "epoch": 1.6173048170057855, "grad_norm": 1.3164430856704712, "learning_rate": 2.3853835240864743e-05, "loss": 0.222, "step": 3110 }, { "epoch": 1.6225053630631217, "grad_norm": 1.467546820640564, "learning_rate": 2.3710684729834954e-05, "loss": 0.173, "step": 3120 }, { "epoch": 1.6277059091204578, "grad_norm": 0.9425441026687622, "learning_rate": 2.3567576592522507e-05, "loss": 0.2174, "step": 3130 }, { "epoch": 1.6329064551777936, "grad_norm": 1.062456488609314, "learning_rate": 2.342451553221752e-05, "loss": 0.1934, "step": 3140 }, { "epoch": 1.6381070012351295, "grad_norm": 1.7149615287780762, "learning_rate": 2.32815062506629e-05, "loss": 0.2479, "step": 3150 }, { "epoch": 1.6433075472924656, "grad_norm": 0.6858556866645813, "learning_rate": 2.3138553447899835e-05, "loss": 0.1825, "step": 3160 }, { "epoch": 1.6485080933498018, "grad_norm": 0.9924718737602234, "learning_rate": 2.299566182211333e-05, "loss": 0.155, "step": 3170 }, { "epoch": 1.6537086394071379, "grad_norm": 2.138089656829834, "learning_rate": 2.2852836069477773e-05, "loss": 0.2105, "step": 3180 }, { "epoch": 1.6589091854644737, "grad_norm": 1.5541861057281494, "learning_rate": 2.2710080884002632e-05, "loss": 0.2087, "step": 3190 }, { "epoch": 1.6641097315218096, "grad_norm": 1.846656084060669, "learning_rate": 2.2567400957378132e-05, "loss": 0.1669, "step": 3200 }, { "epoch": 1.6693102775791457, "grad_norm": 2.2019214630126953, "learning_rate": 2.2424800978821146e-05, "loss": 0.1955, "step": 3210 }, { "epoch": 1.6745108236364818, "grad_norm": 0.8931058645248413, "learning_rate": 2.228228563492098e-05, "loss": 0.1679, "step": 3220 }, { "epoch": 1.679711369693818, "grad_norm": 1.5306602716445923, "learning_rate": 2.2139859609485426e-05, "loss": 0.1887, "step": 3230 }, { "epoch": 1.6849119157511538, "grad_norm": 0.7173328399658203, "learning_rate": 2.199752758338679e-05, "loss": 0.1744, "step": 3240 }, { "epoch": 1.6901124618084897, "grad_norm": 2.8038320541381836, "learning_rate": 2.1855294234408068e-05, "loss": 0.2108, "step": 3250 }, { "epoch": 1.6953130078658258, "grad_norm": 1.2980599403381348, "learning_rate": 2.1713164237089203e-05, "loss": 0.1721, "step": 3260 }, { "epoch": 1.700513553923162, "grad_norm": 1.4280049800872803, "learning_rate": 2.1571142262573457e-05, "loss": 0.1959, "step": 3270 }, { "epoch": 1.705714099980498, "grad_norm": 2.656005382537842, "learning_rate": 2.1429232978453862e-05, "loss": 0.2284, "step": 3280 }, { "epoch": 1.710914646037834, "grad_norm": 0.8656441569328308, "learning_rate": 2.128744104861991e-05, "loss": 0.2159, "step": 3290 }, { "epoch": 1.7161151920951698, "grad_norm": 1.6419271230697632, "learning_rate": 2.1145771133104157e-05, "loss": 0.1671, "step": 3300 }, { "epoch": 1.721315738152506, "grad_norm": 1.286908507347107, "learning_rate": 2.1004227887929133e-05, "loss": 0.1683, "step": 3310 }, { "epoch": 1.726516284209842, "grad_norm": 3.205409288406372, "learning_rate": 2.086281596495434e-05, "loss": 0.1585, "step": 3320 }, { "epoch": 1.7317168302671782, "grad_norm": 0.6113395094871521, "learning_rate": 2.07215400117233e-05, "loss": 0.1593, "step": 3330 }, { "epoch": 1.736917376324514, "grad_norm": 1.3752492666244507, "learning_rate": 2.0580404671310878e-05, "loss": 0.2058, "step": 3340 }, { "epoch": 1.7421179223818501, "grad_norm": 0.68391352891922, "learning_rate": 2.0439414582170628e-05, "loss": 0.1796, "step": 3350 }, { "epoch": 1.747318468439186, "grad_norm": 1.9185495376586914, "learning_rate": 2.0298574377982427e-05, "loss": 0.2212, "step": 3360 }, { "epoch": 1.7525190144965221, "grad_norm": 1.3910088539123535, "learning_rate": 2.015788868750009e-05, "loss": 0.1488, "step": 3370 }, { "epoch": 1.7577195605538583, "grad_norm": 0.8257030248641968, "learning_rate": 2.001736213439933e-05, "loss": 0.1957, "step": 3380 }, { "epoch": 1.7629201066111941, "grad_norm": 0.8184394240379333, "learning_rate": 1.987699933712573e-05, "loss": 0.2042, "step": 3390 }, { "epoch": 1.7681206526685302, "grad_norm": 0.9625434875488281, "learning_rate": 1.9736804908743033e-05, "loss": 0.1953, "step": 3400 }, { "epoch": 1.7733211987258661, "grad_norm": 2.588742256164551, "learning_rate": 1.959678345678146e-05, "loss": 0.2007, "step": 3410 }, { "epoch": 1.7785217447832022, "grad_norm": 1.6495355367660522, "learning_rate": 1.9456939583086303e-05, "loss": 0.1823, "step": 3420 }, { "epoch": 1.7837222908405383, "grad_norm": 1.325899600982666, "learning_rate": 1.9317277883666745e-05, "loss": 0.2144, "step": 3430 }, { "epoch": 1.7889228368978742, "grad_norm": 1.2811932563781738, "learning_rate": 1.91778029485447e-05, "loss": 0.2244, "step": 3440 }, { "epoch": 1.7941233829552103, "grad_norm": 1.6615418195724487, "learning_rate": 1.9038519361604046e-05, "loss": 0.1965, "step": 3450 }, { "epoch": 1.7993239290125462, "grad_norm": 1.7860767841339111, "learning_rate": 1.8899431700439946e-05, "loss": 0.206, "step": 3460 }, { "epoch": 1.8045244750698823, "grad_norm": 1.323864221572876, "learning_rate": 1.876054453620841e-05, "loss": 0.1507, "step": 3470 }, { "epoch": 1.8097250211272184, "grad_norm": 1.264664649963379, "learning_rate": 1.8621862433476054e-05, "loss": 0.1847, "step": 3480 }, { "epoch": 1.8149255671845543, "grad_norm": 2.377115249633789, "learning_rate": 1.8483389950070097e-05, "loss": 0.2117, "step": 3490 }, { "epoch": 1.8201261132418904, "grad_norm": 1.387811541557312, "learning_rate": 1.8345131636928518e-05, "loss": 0.2048, "step": 3500 }, { "epoch": 1.8201261132418904, "eval_loss": 0.2365516871213913, "eval_runtime": 134.9588, "eval_samples_per_second": 14.241, "eval_steps_per_second": 14.241, "step": 3500 } ], "logging_steps": 10, "max_steps": 5769, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2327670832608051e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }