{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9992228935207619, "eval_steps": 5000, "global_step": 415000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012038830042418818, "grad_norm": 8.390454292297363, "learning_rate": 5.910962367274216e-07, "loss": 4.9044, "step": 500 }, { "epoch": 0.0024077660084837636, "grad_norm": 12.908564567565918, "learning_rate": 1.1930272313581972e-06, "loss": 4.8297, "step": 1000 }, { "epoch": 0.0036116490127256454, "grad_norm": 16.473039627075195, "learning_rate": 1.7949582259889727e-06, "loss": 4.7877, "step": 1500 }, { "epoch": 0.004815532016967527, "grad_norm": 20.620445251464844, "learning_rate": 2.3968892206197483e-06, "loss": 4.6598, "step": 2000 }, { "epoch": 0.0060194150212094085, "grad_norm": 17.269424438476562, "learning_rate": 2.996412491272001e-06, "loss": 4.3792, "step": 2500 }, { "epoch": 0.007223298025451291, "grad_norm": 20.924165725708008, "learning_rate": 3.598343485902776e-06, "loss": 4.1505, "step": 3000 }, { "epoch": 0.008427181029693172, "grad_norm": 22.36298179626465, "learning_rate": 4.200274480533552e-06, "loss": 3.8286, "step": 3500 }, { "epoch": 0.009631064033935054, "grad_norm": 17.17203712463379, "learning_rate": 4.802205475164327e-06, "loss": 3.6786, "step": 4000 }, { "epoch": 0.010834947038176937, "grad_norm": 21.19932746887207, "learning_rate": 5.404136469795103e-06, "loss": 3.5698, "step": 4500 }, { "epoch": 0.012038830042418817, "grad_norm": 26.214515686035156, "learning_rate": 6.0060674644258785e-06, "loss": 3.5362, "step": 5000 }, { "epoch": 0.012038830042418817, "eval_runtime": 6118.3437, "eval_samples_per_second": 135.763, "eval_steps_per_second": 33.941, "step": 5000 }, { "epoch": 0.0132427130466607, "grad_norm": 20.36381721496582, "learning_rate": 6.6079984590566535e-06, "loss": 3.4738, "step": 5500 }, { "epoch": 0.014446596050902582, "grad_norm": 27.98621940612793, "learning_rate": 7.209929453687429e-06, "loss": 3.4793, "step": 6000 }, { "epoch": 0.015650479055144464, "grad_norm": 36.69669723510742, "learning_rate": 7.810656586328943e-06, "loss": 3.4226, "step": 6500 }, { "epoch": 0.016854362059386344, "grad_norm": 19.70973014831543, "learning_rate": 8.412587580959718e-06, "loss": 3.3237, "step": 7000 }, { "epoch": 0.018058245063628225, "grad_norm": 22.874879837036133, "learning_rate": 9.014518575590495e-06, "loss": 3.3729, "step": 7500 }, { "epoch": 0.01926212806787011, "grad_norm": 27.921424865722656, "learning_rate": 9.61644957022127e-06, "loss": 3.3806, "step": 8000 }, { "epoch": 0.02046601107211199, "grad_norm": 20.24401092529297, "learning_rate": 1.0218380564852045e-05, "loss": 3.3732, "step": 8500 }, { "epoch": 0.021669894076353873, "grad_norm": 24.8074893951416, "learning_rate": 1.0820311559482821e-05, "loss": 3.3689, "step": 9000 }, { "epoch": 0.022873777080595754, "grad_norm": 20.271554946899414, "learning_rate": 1.1422242554113596e-05, "loss": 3.3533, "step": 9500 }, { "epoch": 0.024077660084837634, "grad_norm": 25.907699584960938, "learning_rate": 1.2024173548744371e-05, "loss": 3.3256, "step": 10000 }, { "epoch": 0.024077660084837634, "eval_runtime": 5885.2151, "eval_samples_per_second": 141.141, "eval_steps_per_second": 35.285, "step": 10000 }, { "epoch": 0.025281543089079518, "grad_norm": 27.258106231689453, "learning_rate": 1.2624900681385886e-05, "loss": 3.3775, "step": 10500 }, { "epoch": 0.0264854260933214, "grad_norm": 34.32582473754883, "learning_rate": 1.3226831676016663e-05, "loss": 3.3426, "step": 11000 }, { "epoch": 0.02768930909756328, "grad_norm": 32.09712600708008, "learning_rate": 1.3828762670647438e-05, "loss": 3.3584, "step": 11500 }, { "epoch": 0.028893192101805163, "grad_norm": 21.120548248291016, "learning_rate": 1.4430693665278213e-05, "loss": 3.3117, "step": 12000 }, { "epoch": 0.030097075106047044, "grad_norm": 21.870174407958984, "learning_rate": 1.503262465990899e-05, "loss": 3.3063, "step": 12500 }, { "epoch": 0.03130095811028893, "grad_norm": 20.636926651000977, "learning_rate": 1.56333517925505e-05, "loss": 3.3181, "step": 13000 }, { "epoch": 0.03250484111453081, "grad_norm": 17.365741729736328, "learning_rate": 1.623528278718128e-05, "loss": 3.2712, "step": 13500 }, { "epoch": 0.03370872411877269, "grad_norm": 29.897491455078125, "learning_rate": 1.683721378181205e-05, "loss": 3.3179, "step": 14000 }, { "epoch": 0.03491260712301457, "grad_norm": 21.453102111816406, "learning_rate": 1.7439144776442828e-05, "loss": 3.3506, "step": 14500 }, { "epoch": 0.03611649012725645, "grad_norm": 27.832408905029297, "learning_rate": 1.8041075771073605e-05, "loss": 3.3097, "step": 15000 }, { "epoch": 0.03611649012725645, "eval_runtime": 6111.9913, "eval_samples_per_second": 135.904, "eval_steps_per_second": 33.976, "step": 15000 }, { "epoch": 0.03732037313149834, "grad_norm": 27.701480865478516, "learning_rate": 1.864180290371512e-05, "loss": 3.3058, "step": 15500 }, { "epoch": 0.03852425613574022, "grad_norm": 28.425107955932617, "learning_rate": 1.9243733898345895e-05, "loss": 3.3129, "step": 16000 }, { "epoch": 0.0397281391399821, "grad_norm": 20.096038818359375, "learning_rate": 1.9845664892976672e-05, "loss": 3.3126, "step": 16500 }, { "epoch": 0.04093202214422398, "grad_norm": 32.19493103027344, "learning_rate": 2.0447595887607445e-05, "loss": 3.2431, "step": 17000 }, { "epoch": 0.04213590514846586, "grad_norm": 27.442581176757812, "learning_rate": 2.1049526882238222e-05, "loss": 3.3009, "step": 17500 }, { "epoch": 0.043339788152707746, "grad_norm": 23.221601486206055, "learning_rate": 2.1650254014879736e-05, "loss": 3.3023, "step": 18000 }, { "epoch": 0.04454367115694963, "grad_norm": 20.713502883911133, "learning_rate": 2.225218500951051e-05, "loss": 3.2834, "step": 18500 }, { "epoch": 0.04574755416119151, "grad_norm": 26.963842391967773, "learning_rate": 2.2854116004141285e-05, "loss": 3.2512, "step": 19000 }, { "epoch": 0.04695143716543339, "grad_norm": 26.029918670654297, "learning_rate": 2.3456046998772062e-05, "loss": 3.2678, "step": 19500 }, { "epoch": 0.04815532016967527, "grad_norm": 24.989070892333984, "learning_rate": 2.405797799340284e-05, "loss": 3.2962, "step": 20000 }, { "epoch": 0.04815532016967527, "eval_runtime": 6189.8097, "eval_samples_per_second": 134.196, "eval_steps_per_second": 33.549, "step": 20000 }, { "epoch": 0.049359203173917156, "grad_norm": 31.154277801513672, "learning_rate": 2.4659908988033615e-05, "loss": 3.2434, "step": 20500 }, { "epoch": 0.050563086178159036, "grad_norm": 25.946931838989258, "learning_rate": 2.526183998266439e-05, "loss": 3.2261, "step": 21000 }, { "epoch": 0.05176696918240092, "grad_norm": 18.157636642456055, "learning_rate": 2.5863770977295165e-05, "loss": 3.2844, "step": 21500 }, { "epoch": 0.0529708521866428, "grad_norm": 21.17293930053711, "learning_rate": 2.6464498109936682e-05, "loss": 3.2154, "step": 22000 }, { "epoch": 0.05417473519088468, "grad_norm": 26.187950134277344, "learning_rate": 2.7066429104567452e-05, "loss": 3.2815, "step": 22500 }, { "epoch": 0.05537861819512656, "grad_norm": 27.726953506469727, "learning_rate": 2.766715623720897e-05, "loss": 3.2538, "step": 23000 }, { "epoch": 0.056582501199368446, "grad_norm": 23.557273864746094, "learning_rate": 2.8269087231839743e-05, "loss": 3.2708, "step": 23500 }, { "epoch": 0.057786384203610326, "grad_norm": 22.728757858276367, "learning_rate": 2.8869814364481256e-05, "loss": 3.2052, "step": 24000 }, { "epoch": 0.05899026720785221, "grad_norm": 22.837238311767578, "learning_rate": 2.9471745359112036e-05, "loss": 3.2341, "step": 24500 }, { "epoch": 0.06019415021209409, "grad_norm": 24.8745059967041, "learning_rate": 3.0073676353742806e-05, "loss": 3.2309, "step": 25000 }, { "epoch": 0.06019415021209409, "eval_runtime": 6182.4703, "eval_samples_per_second": 134.355, "eval_steps_per_second": 33.589, "step": 25000 }, { "epoch": 0.06139803321633597, "grad_norm": 26.50299835205078, "learning_rate": 3.0675607348373586e-05, "loss": 3.2289, "step": 25500 }, { "epoch": 0.06260191622057786, "grad_norm": 26.083864212036133, "learning_rate": 3.1277538343004356e-05, "loss": 3.2321, "step": 26000 }, { "epoch": 0.06380579922481973, "grad_norm": 34.2237434387207, "learning_rate": 3.187946933763514e-05, "loss": 3.2297, "step": 26500 }, { "epoch": 0.06500968222906162, "grad_norm": 18.643739700317383, "learning_rate": 3.248140033226591e-05, "loss": 3.2801, "step": 27000 }, { "epoch": 0.0662135652333035, "grad_norm": 16.629045486450195, "learning_rate": 3.3083331326896686e-05, "loss": 3.2924, "step": 27500 }, { "epoch": 0.06741744823754538, "grad_norm": 24.145009994506836, "learning_rate": 3.368526232152746e-05, "loss": 3.208, "step": 28000 }, { "epoch": 0.06862133124178726, "grad_norm": 27.239717483520508, "learning_rate": 3.428719331615824e-05, "loss": 3.2432, "step": 28500 }, { "epoch": 0.06982521424602914, "grad_norm": 19.675600051879883, "learning_rate": 3.488912431078901e-05, "loss": 3.2389, "step": 29000 }, { "epoch": 0.07102909725027103, "grad_norm": 26.076309204101562, "learning_rate": 3.549105530541979e-05, "loss": 3.2849, "step": 29500 }, { "epoch": 0.0722329802545129, "grad_norm": 26.58043670654297, "learning_rate": 3.60917824380613e-05, "loss": 3.2597, "step": 30000 }, { "epoch": 0.0722329802545129, "eval_runtime": 6142.9637, "eval_samples_per_second": 135.219, "eval_steps_per_second": 33.805, "step": 30000 }, { "epoch": 0.07343686325875479, "grad_norm": 28.792818069458008, "learning_rate": 3.6693713432692076e-05, "loss": 3.2583, "step": 30500 }, { "epoch": 0.07464074626299667, "grad_norm": 15.474958419799805, "learning_rate": 3.72944405653336e-05, "loss": 3.2516, "step": 31000 }, { "epoch": 0.07584462926723855, "grad_norm": 22.783601760864258, "learning_rate": 3.7896371559964367e-05, "loss": 3.252, "step": 31500 }, { "epoch": 0.07704851227148043, "grad_norm": 30.097503662109375, "learning_rate": 3.849830255459514e-05, "loss": 3.2607, "step": 32000 }, { "epoch": 0.07825239527572231, "grad_norm": 16.432775497436523, "learning_rate": 3.910023354922592e-05, "loss": 3.2281, "step": 32500 }, { "epoch": 0.0794562782799642, "grad_norm": 22.113264083862305, "learning_rate": 3.9702164543856697e-05, "loss": 3.1994, "step": 33000 }, { "epoch": 0.08066016128420608, "grad_norm": 23.033098220825195, "learning_rate": 4.0304095538487466e-05, "loss": 3.2641, "step": 33500 }, { "epoch": 0.08186404428844796, "grad_norm": 22.20917510986328, "learning_rate": 4.090602653311825e-05, "loss": 3.2382, "step": 34000 }, { "epoch": 0.08306792729268984, "grad_norm": 29.848487854003906, "learning_rate": 4.150795752774902e-05, "loss": 3.2067, "step": 34500 }, { "epoch": 0.08427181029693172, "grad_norm": 25.557289123535156, "learning_rate": 4.2108684660390533e-05, "loss": 3.1953, "step": 35000 }, { "epoch": 0.08427181029693172, "eval_runtime": 6095.2446, "eval_samples_per_second": 136.278, "eval_steps_per_second": 34.07, "step": 35000 }, { "epoch": 0.0854756933011736, "grad_norm": 20.05970001220703, "learning_rate": 4.271061565502131e-05, "loss": 3.2184, "step": 35500 }, { "epoch": 0.08667957630541549, "grad_norm": 20.745208740234375, "learning_rate": 4.3311342787662824e-05, "loss": 3.2016, "step": 36000 }, { "epoch": 0.08788345930965737, "grad_norm": 24.379274368286133, "learning_rate": 4.39132737822936e-05, "loss": 3.1851, "step": 36500 }, { "epoch": 0.08908734231389925, "grad_norm": 14.577661514282227, "learning_rate": 4.451520477692438e-05, "loss": 3.2191, "step": 37000 }, { "epoch": 0.09029122531814113, "grad_norm": 19.822919845581055, "learning_rate": 4.5117135771555154e-05, "loss": 3.1865, "step": 37500 }, { "epoch": 0.09149510832238301, "grad_norm": 20.753366470336914, "learning_rate": 4.5719066766185924e-05, "loss": 3.238, "step": 38000 }, { "epoch": 0.0926989913266249, "grad_norm": 16.455045700073242, "learning_rate": 4.63209977608167e-05, "loss": 3.2118, "step": 38500 }, { "epoch": 0.09390287433086678, "grad_norm": 22.421308517456055, "learning_rate": 4.692052103146896e-05, "loss": 3.2192, "step": 39000 }, { "epoch": 0.09510675733510866, "grad_norm": 18.18105125427246, "learning_rate": 4.752245202609973e-05, "loss": 3.2, "step": 39500 }, { "epoch": 0.09631064033935054, "grad_norm": 14.840981483459473, "learning_rate": 4.8124383020730504e-05, "loss": 3.1897, "step": 40000 }, { "epoch": 0.09631064033935054, "eval_runtime": 6142.3167, "eval_samples_per_second": 135.233, "eval_steps_per_second": 33.808, "step": 40000 }, { "epoch": 0.09751452334359242, "grad_norm": 16.446571350097656, "learning_rate": 4.872631401536128e-05, "loss": 3.2723, "step": 40500 }, { "epoch": 0.09871840634783431, "grad_norm": 32.157161712646484, "learning_rate": 4.932824500999206e-05, "loss": 3.1675, "step": 41000 }, { "epoch": 0.09992228935207619, "grad_norm": 18.431787490844727, "learning_rate": 4.993017600462283e-05, "loss": 3.2205, "step": 41500 }, { "epoch": 0.10112617235631807, "grad_norm": 21.929658889770508, "learning_rate": 4.994087573470594e-05, "loss": 3.1701, "step": 42000 }, { "epoch": 0.10233005536055995, "grad_norm": 20.648868560791016, "learning_rate": 4.987399308165837e-05, "loss": 3.1764, "step": 42500 }, { "epoch": 0.10353393836480183, "grad_norm": 20.37611198425293, "learning_rate": 4.98071104286108e-05, "loss": 3.1684, "step": 43000 }, { "epoch": 0.10473782136904371, "grad_norm": 15.72383975982666, "learning_rate": 4.974036154086932e-05, "loss": 3.1698, "step": 43500 }, { "epoch": 0.1059417043732856, "grad_norm": 17.074922561645508, "learning_rate": 4.967347888782174e-05, "loss": 3.1913, "step": 44000 }, { "epoch": 0.10714558737752748, "grad_norm": 16.70379066467285, "learning_rate": 4.9606596234774164e-05, "loss": 3.2184, "step": 44500 }, { "epoch": 0.10834947038176936, "grad_norm": 12.970040321350098, "learning_rate": 4.9539713581726594e-05, "loss": 3.1826, "step": 45000 }, { "epoch": 0.10834947038176936, "eval_runtime": 6114.7803, "eval_samples_per_second": 135.842, "eval_steps_per_second": 33.961, "step": 45000 }, { "epoch": 0.10955335338601124, "grad_norm": 17.909025192260742, "learning_rate": 4.9472830928679016e-05, "loss": 3.175, "step": 45500 }, { "epoch": 0.11075723639025312, "grad_norm": 17.254976272583008, "learning_rate": 4.9405948275631445e-05, "loss": 3.1959, "step": 46000 }, { "epoch": 0.111961119394495, "grad_norm": 17.655014038085938, "learning_rate": 4.9339199387889964e-05, "loss": 3.194, "step": 46500 }, { "epoch": 0.11316500239873689, "grad_norm": 18.48583984375, "learning_rate": 4.9272316734842386e-05, "loss": 3.19, "step": 47000 }, { "epoch": 0.11436888540297876, "grad_norm": 13.14960765838623, "learning_rate": 4.920556784710091e-05, "loss": 3.159, "step": 47500 }, { "epoch": 0.11557276840722065, "grad_norm": 13.100486755371094, "learning_rate": 4.913868519405333e-05, "loss": 3.197, "step": 48000 }, { "epoch": 0.11677665141146253, "grad_norm": 20.155548095703125, "learning_rate": 4.9071802541005756e-05, "loss": 3.192, "step": 48500 }, { "epoch": 0.11798053441570441, "grad_norm": 16.038169860839844, "learning_rate": 4.9004919887958185e-05, "loss": 3.1575, "step": 49000 }, { "epoch": 0.1191844174199463, "grad_norm": 22.394641876220703, "learning_rate": 4.893803723491061e-05, "loss": 3.1787, "step": 49500 }, { "epoch": 0.12038830042418817, "grad_norm": 18.06816864013672, "learning_rate": 4.887115458186303e-05, "loss": 3.138, "step": 50000 }, { "epoch": 0.12038830042418817, "eval_runtime": 6098.0137, "eval_samples_per_second": 136.216, "eval_steps_per_second": 34.054, "step": 50000 }, { "epoch": 0.12159218342843006, "grad_norm": 25.612211227416992, "learning_rate": 4.880427192881546e-05, "loss": 3.1291, "step": 50500 }, { "epoch": 0.12279606643267194, "grad_norm": 23.026065826416016, "learning_rate": 4.873738927576788e-05, "loss": 3.1429, "step": 51000 }, { "epoch": 0.12399994943691382, "grad_norm": 17.348302841186523, "learning_rate": 4.86706403880264e-05, "loss": 3.208, "step": 51500 }, { "epoch": 0.1252038324411557, "grad_norm": 25.699726104736328, "learning_rate": 4.860375773497883e-05, "loss": 3.1744, "step": 52000 }, { "epoch": 0.12640771544539758, "grad_norm": 20.069185256958008, "learning_rate": 4.853687508193125e-05, "loss": 3.1429, "step": 52500 }, { "epoch": 0.12761159844963946, "grad_norm": 15.095413208007812, "learning_rate": 4.8469992428883674e-05, "loss": 3.1767, "step": 53000 }, { "epoch": 0.12881548145388136, "grad_norm": 12.854238510131836, "learning_rate": 4.84031097758361e-05, "loss": 3.1726, "step": 53500 }, { "epoch": 0.13001936445812323, "grad_norm": 18.715251922607422, "learning_rate": 4.833622712278853e-05, "loss": 3.1745, "step": 54000 }, { "epoch": 0.1312232474623651, "grad_norm": 16.41513442993164, "learning_rate": 4.8269344469740955e-05, "loss": 3.163, "step": 54500 }, { "epoch": 0.132427130466607, "grad_norm": 16.754322052001953, "learning_rate": 4.820246181669338e-05, "loss": 3.2186, "step": 55000 }, { "epoch": 0.132427130466607, "eval_runtime": 6080.0844, "eval_samples_per_second": 136.617, "eval_steps_per_second": 34.154, "step": 55000 }, { "epoch": 0.13363101347084888, "grad_norm": 21.774137496948242, "learning_rate": 4.8135712928951896e-05, "loss": 3.1601, "step": 55500 }, { "epoch": 0.13483489647509075, "grad_norm": 15.781363487243652, "learning_rate": 4.8068830275904325e-05, "loss": 3.1762, "step": 56000 }, { "epoch": 0.13603877947933263, "grad_norm": 15.000273704528809, "learning_rate": 4.800194762285675e-05, "loss": 3.1576, "step": 56500 }, { "epoch": 0.13724266248357453, "grad_norm": 16.367721557617188, "learning_rate": 4.7935064969809176e-05, "loss": 3.1373, "step": 57000 }, { "epoch": 0.1384465454878164, "grad_norm": 14.089083671569824, "learning_rate": 4.7868316082067695e-05, "loss": 3.1886, "step": 57500 }, { "epoch": 0.13965042849205828, "grad_norm": 15.01375675201416, "learning_rate": 4.780143342902012e-05, "loss": 3.1041, "step": 58000 }, { "epoch": 0.14085431149630018, "grad_norm": 15.857519149780273, "learning_rate": 4.773455077597254e-05, "loss": 3.157, "step": 58500 }, { "epoch": 0.14205819450054205, "grad_norm": 15.649994850158691, "learning_rate": 4.7667801888231065e-05, "loss": 3.1307, "step": 59000 }, { "epoch": 0.14326207750478392, "grad_norm": 17.55912208557129, "learning_rate": 4.760091923518349e-05, "loss": 3.1491, "step": 59500 }, { "epoch": 0.1444659605090258, "grad_norm": 39.259796142578125, "learning_rate": 4.753403658213592e-05, "loss": 3.156, "step": 60000 }, { "epoch": 0.1444659605090258, "eval_runtime": 6118.114, "eval_samples_per_second": 135.768, "eval_steps_per_second": 33.942, "step": 60000 }, { "epoch": 0.1456698435132677, "grad_norm": 14.720120429992676, "learning_rate": 4.746715392908834e-05, "loss": 3.1288, "step": 60500 }, { "epoch": 0.14687372651750957, "grad_norm": 17.65592384338379, "learning_rate": 4.740027127604076e-05, "loss": 3.1324, "step": 61000 }, { "epoch": 0.14807760952175145, "grad_norm": 22.232887268066406, "learning_rate": 4.7333388622993184e-05, "loss": 3.1047, "step": 61500 }, { "epoch": 0.14928149252599335, "grad_norm": 13.700913429260254, "learning_rate": 4.726650596994561e-05, "loss": 3.1472, "step": 62000 }, { "epoch": 0.15048537553023522, "grad_norm": 12.120200157165527, "learning_rate": 4.719962331689804e-05, "loss": 3.1314, "step": 62500 }, { "epoch": 0.1516892585344771, "grad_norm": 18.1831111907959, "learning_rate": 4.713287442915656e-05, "loss": 3.1473, "step": 63000 }, { "epoch": 0.152893141538719, "grad_norm": 16.345584869384766, "learning_rate": 4.706599177610898e-05, "loss": 3.0957, "step": 63500 }, { "epoch": 0.15409702454296087, "grad_norm": 12.382160186767578, "learning_rate": 4.6999109123061405e-05, "loss": 3.1352, "step": 64000 }, { "epoch": 0.15530090754720274, "grad_norm": 11.769241333007812, "learning_rate": 4.6932226470013834e-05, "loss": 3.1241, "step": 64500 }, { "epoch": 0.15650479055144462, "grad_norm": 16.600685119628906, "learning_rate": 4.686547758227235e-05, "loss": 3.1001, "step": 65000 }, { "epoch": 0.15650479055144462, "eval_runtime": 6099.2, "eval_samples_per_second": 136.189, "eval_steps_per_second": 34.047, "step": 65000 }, { "epoch": 0.15770867355568652, "grad_norm": 27.690532684326172, "learning_rate": 4.6798594929224775e-05, "loss": 3.1427, "step": 65500 }, { "epoch": 0.1589125565599284, "grad_norm": 16.452138900756836, "learning_rate": 4.6731712276177205e-05, "loss": 3.1489, "step": 66000 }, { "epoch": 0.16011643956417027, "grad_norm": 22.97121238708496, "learning_rate": 4.666482962312963e-05, "loss": 3.1554, "step": 66500 }, { "epoch": 0.16132032256841217, "grad_norm": 17.040058135986328, "learning_rate": 4.659808073538815e-05, "loss": 3.1619, "step": 67000 }, { "epoch": 0.16252420557265404, "grad_norm": 15.484249114990234, "learning_rate": 4.653119808234057e-05, "loss": 3.1277, "step": 67500 }, { "epoch": 0.1637280885768959, "grad_norm": 14.11099910736084, "learning_rate": 4.6464315429293e-05, "loss": 3.1736, "step": 68000 }, { "epoch": 0.16493197158113782, "grad_norm": 13.42932415008545, "learning_rate": 4.6397432776245426e-05, "loss": 3.1114, "step": 68500 }, { "epoch": 0.1661358545853797, "grad_norm": 14.439802169799805, "learning_rate": 4.633055012319785e-05, "loss": 3.1526, "step": 69000 }, { "epoch": 0.16733973758962156, "grad_norm": 19.759899139404297, "learning_rate": 4.626366747015027e-05, "loss": 3.13, "step": 69500 }, { "epoch": 0.16854362059386344, "grad_norm": 14.265944480895996, "learning_rate": 4.61967848171027e-05, "loss": 3.1197, "step": 70000 }, { "epoch": 0.16854362059386344, "eval_runtime": 6159.4194, "eval_samples_per_second": 134.858, "eval_steps_per_second": 33.715, "step": 70000 }, { "epoch": 0.16974750359810534, "grad_norm": 14.379167556762695, "learning_rate": 4.612990216405512e-05, "loss": 3.1587, "step": 70500 }, { "epoch": 0.1709513866023472, "grad_norm": 16.7622013092041, "learning_rate": 4.606315327631364e-05, "loss": 3.1265, "step": 71000 }, { "epoch": 0.17215526960658908, "grad_norm": 16.946565628051758, "learning_rate": 4.599627062326607e-05, "loss": 3.1666, "step": 71500 }, { "epoch": 0.17335915261083099, "grad_norm": 15.118630409240723, "learning_rate": 4.592952173552459e-05, "loss": 3.1067, "step": 72000 }, { "epoch": 0.17456303561507286, "grad_norm": 13.015983581542969, "learning_rate": 4.586263908247702e-05, "loss": 3.0988, "step": 72500 }, { "epoch": 0.17576691861931473, "grad_norm": 16.574451446533203, "learning_rate": 4.579575642942944e-05, "loss": 3.1373, "step": 73000 }, { "epoch": 0.1769708016235566, "grad_norm": 13.634519577026367, "learning_rate": 4.572887377638186e-05, "loss": 3.1362, "step": 73500 }, { "epoch": 0.1781746846277985, "grad_norm": 19.195165634155273, "learning_rate": 4.566199112333429e-05, "loss": 3.0999, "step": 74000 }, { "epoch": 0.17937856763204038, "grad_norm": 16.080059051513672, "learning_rate": 4.5595108470286714e-05, "loss": 3.1615, "step": 74500 }, { "epoch": 0.18058245063628225, "grad_norm": 17.560720443725586, "learning_rate": 4.5528225817239137e-05, "loss": 3.1018, "step": 75000 }, { "epoch": 0.18058245063628225, "eval_runtime": 6167.2803, "eval_samples_per_second": 134.686, "eval_steps_per_second": 33.672, "step": 75000 }, { "epoch": 0.18178633364052416, "grad_norm": 14.580060005187988, "learning_rate": 4.5461343164191566e-05, "loss": 3.1417, "step": 75500 }, { "epoch": 0.18299021664476603, "grad_norm": 20.411680221557617, "learning_rate": 4.5394594276450084e-05, "loss": 3.1915, "step": 76000 }, { "epoch": 0.1841940996490079, "grad_norm": 24.153568267822266, "learning_rate": 4.53278453887086e-05, "loss": 3.129, "step": 76500 }, { "epoch": 0.1853979826532498, "grad_norm": 20.700895309448242, "learning_rate": 4.5260962735661025e-05, "loss": 3.1237, "step": 77000 }, { "epoch": 0.18660186565749168, "grad_norm": 19.473398208618164, "learning_rate": 4.5194080082613455e-05, "loss": 3.1593, "step": 77500 }, { "epoch": 0.18780574866173355, "grad_norm": 16.656599044799805, "learning_rate": 4.5127197429565884e-05, "loss": 3.1273, "step": 78000 }, { "epoch": 0.18900963166597542, "grad_norm": 17.83378028869629, "learning_rate": 4.5060314776518306e-05, "loss": 3.1454, "step": 78500 }, { "epoch": 0.19021351467021733, "grad_norm": 12.069873809814453, "learning_rate": 4.499343212347073e-05, "loss": 3.1567, "step": 79000 }, { "epoch": 0.1914173976744592, "grad_norm": 21.635231018066406, "learning_rate": 4.492654947042316e-05, "loss": 3.1063, "step": 79500 }, { "epoch": 0.19262128067870107, "grad_norm": 12.80612564086914, "learning_rate": 4.4859800582681676e-05, "loss": 3.1169, "step": 80000 }, { "epoch": 0.19262128067870107, "eval_runtime": 6089.7621, "eval_samples_per_second": 136.4, "eval_steps_per_second": 34.1, "step": 80000 }, { "epoch": 0.19382516368294297, "grad_norm": 15.964377403259277, "learning_rate": 4.47929179296341e-05, "loss": 3.1736, "step": 80500 }, { "epoch": 0.19502904668718485, "grad_norm": 33.44254684448242, "learning_rate": 4.472603527658653e-05, "loss": 3.1525, "step": 81000 }, { "epoch": 0.19623292969142672, "grad_norm": 13.991809844970703, "learning_rate": 4.465915262353895e-05, "loss": 3.1492, "step": 81500 }, { "epoch": 0.19743681269566862, "grad_norm": 12.851255416870117, "learning_rate": 4.4592403735797475e-05, "loss": 3.1153, "step": 82000 }, { "epoch": 0.1986406956999105, "grad_norm": 17.928274154663086, "learning_rate": 4.452552108274989e-05, "loss": 3.1518, "step": 82500 }, { "epoch": 0.19984457870415237, "grad_norm": 12.124229431152344, "learning_rate": 4.445863842970232e-05, "loss": 3.1087, "step": 83000 }, { "epoch": 0.20104846170839424, "grad_norm": 15.766402244567871, "learning_rate": 4.439175577665475e-05, "loss": 3.1327, "step": 83500 }, { "epoch": 0.20225234471263615, "grad_norm": 16.555757522583008, "learning_rate": 4.432487312360717e-05, "loss": 3.0765, "step": 84000 }, { "epoch": 0.20345622771687802, "grad_norm": 19.65941619873047, "learning_rate": 4.4257990470559594e-05, "loss": 3.1393, "step": 84500 }, { "epoch": 0.2046601107211199, "grad_norm": 16.987285614013672, "learning_rate": 4.419110781751202e-05, "loss": 3.1145, "step": 85000 }, { "epoch": 0.2046601107211199, "eval_runtime": 6166.7962, "eval_samples_per_second": 134.696, "eval_steps_per_second": 33.674, "step": 85000 }, { "epoch": 0.2058639937253618, "grad_norm": 14.441193580627441, "learning_rate": 4.4124225164464445e-05, "loss": 3.1662, "step": 85500 }, { "epoch": 0.20706787672960367, "grad_norm": 18.12236976623535, "learning_rate": 4.4057476276722964e-05, "loss": 3.0565, "step": 86000 }, { "epoch": 0.20827175973384554, "grad_norm": 12.500991821289062, "learning_rate": 4.399059362367539e-05, "loss": 3.1047, "step": 86500 }, { "epoch": 0.20947564273808741, "grad_norm": 16.244428634643555, "learning_rate": 4.3923710970627816e-05, "loss": 3.0893, "step": 87000 }, { "epoch": 0.21067952574232932, "grad_norm": 21.911731719970703, "learning_rate": 4.385682831758024e-05, "loss": 3.0743, "step": 87500 }, { "epoch": 0.2118834087465712, "grad_norm": 16.75537109375, "learning_rate": 4.378994566453267e-05, "loss": 3.1221, "step": 88000 }, { "epoch": 0.21308729175081306, "grad_norm": 20.14570426940918, "learning_rate": 4.3723063011485096e-05, "loss": 3.1413, "step": 88500 }, { "epoch": 0.21429117475505496, "grad_norm": 14.766070365905762, "learning_rate": 4.365618035843751e-05, "loss": 3.0955, "step": 89000 }, { "epoch": 0.21549505775929684, "grad_norm": 17.830801010131836, "learning_rate": 4.358929770538994e-05, "loss": 3.1517, "step": 89500 }, { "epoch": 0.2166989407635387, "grad_norm": 10.205118179321289, "learning_rate": 4.352254881764846e-05, "loss": 3.1332, "step": 90000 }, { "epoch": 0.2166989407635387, "eval_runtime": 6149.5749, "eval_samples_per_second": 135.074, "eval_steps_per_second": 33.769, "step": 90000 }, { "epoch": 0.2179028237677806, "grad_norm": 16.20384979248047, "learning_rate": 4.345566616460089e-05, "loss": 3.1003, "step": 90500 }, { "epoch": 0.21910670677202249, "grad_norm": 17.35607147216797, "learning_rate": 4.338878351155331e-05, "loss": 3.1193, "step": 91000 }, { "epoch": 0.22031058977626436, "grad_norm": 17.914997100830078, "learning_rate": 4.332190085850574e-05, "loss": 3.0944, "step": 91500 }, { "epoch": 0.22151447278050623, "grad_norm": 23.45078468322754, "learning_rate": 4.325515197076426e-05, "loss": 3.1518, "step": 92000 }, { "epoch": 0.22271835578474813, "grad_norm": 19.160053253173828, "learning_rate": 4.318826931771668e-05, "loss": 3.1144, "step": 92500 }, { "epoch": 0.22392223878899, "grad_norm": 16.796180725097656, "learning_rate": 4.312152042997521e-05, "loss": 3.1354, "step": 93000 }, { "epoch": 0.22512612179323188, "grad_norm": 13.598986625671387, "learning_rate": 4.305463777692762e-05, "loss": 3.0675, "step": 93500 }, { "epoch": 0.22633000479747378, "grad_norm": 16.168975830078125, "learning_rate": 4.298775512388005e-05, "loss": 3.1065, "step": 94000 }, { "epoch": 0.22753388780171566, "grad_norm": 22.480331420898438, "learning_rate": 4.292087247083248e-05, "loss": 3.1201, "step": 94500 }, { "epoch": 0.22873777080595753, "grad_norm": 16.593976974487305, "learning_rate": 4.28539898177849e-05, "loss": 3.1264, "step": 95000 }, { "epoch": 0.22873777080595753, "eval_runtime": 6100.46, "eval_samples_per_second": 136.161, "eval_steps_per_second": 34.04, "step": 95000 }, { "epoch": 0.2299416538101994, "grad_norm": 14.308032989501953, "learning_rate": 4.2787107164737325e-05, "loss": 3.1278, "step": 95500 }, { "epoch": 0.2311455368144413, "grad_norm": 13.68152141571045, "learning_rate": 4.272035827699585e-05, "loss": 3.1079, "step": 96000 }, { "epoch": 0.23234941981868318, "grad_norm": 15.30040454864502, "learning_rate": 4.265347562394827e-05, "loss": 3.0765, "step": 96500 }, { "epoch": 0.23355330282292505, "grad_norm": 17.36260223388672, "learning_rate": 4.2586592970900695e-05, "loss": 3.0966, "step": 97000 }, { "epoch": 0.23475718582716695, "grad_norm": 16.50679588317871, "learning_rate": 4.2519710317853125e-05, "loss": 3.1462, "step": 97500 }, { "epoch": 0.23596106883140883, "grad_norm": 15.678003311157227, "learning_rate": 4.245282766480555e-05, "loss": 3.1434, "step": 98000 }, { "epoch": 0.2371649518356507, "grad_norm": 14.652356147766113, "learning_rate": 4.238594501175797e-05, "loss": 3.1461, "step": 98500 }, { "epoch": 0.2383688348398926, "grad_norm": 13.707479476928711, "learning_rate": 4.23190623587104e-05, "loss": 3.0894, "step": 99000 }, { "epoch": 0.23957271784413448, "grad_norm": 22.13295555114746, "learning_rate": 4.225217970566282e-05, "loss": 3.1317, "step": 99500 }, { "epoch": 0.24077660084837635, "grad_norm": 14.54344367980957, "learning_rate": 4.2185430817921346e-05, "loss": 3.1209, "step": 100000 }, { "epoch": 0.24077660084837635, "eval_runtime": 6233.0245, "eval_samples_per_second": 133.265, "eval_steps_per_second": 33.316, "step": 100000 }, { "epoch": 0.24198048385261822, "grad_norm": 16.891630172729492, "learning_rate": 4.211854816487377e-05, "loss": 3.1261, "step": 100500 }, { "epoch": 0.24318436685686012, "grad_norm": 17.46337127685547, "learning_rate": 4.205166551182619e-05, "loss": 3.1625, "step": 101000 }, { "epoch": 0.244388249861102, "grad_norm": 14.349138259887695, "learning_rate": 4.1984916624084716e-05, "loss": 3.0834, "step": 101500 }, { "epoch": 0.24559213286534387, "grad_norm": 18.939817428588867, "learning_rate": 4.191803397103714e-05, "loss": 3.1521, "step": 102000 }, { "epoch": 0.24679601586958577, "grad_norm": 16.54868507385254, "learning_rate": 4.185115131798956e-05, "loss": 3.0694, "step": 102500 }, { "epoch": 0.24799989887382765, "grad_norm": 14.203706741333008, "learning_rate": 4.178426866494199e-05, "loss": 3.1201, "step": 103000 }, { "epoch": 0.24920378187806952, "grad_norm": 14.797431945800781, "learning_rate": 4.171738601189441e-05, "loss": 3.1252, "step": 103500 }, { "epoch": 0.2504076648823114, "grad_norm": 14.449517250061035, "learning_rate": 4.165063712415294e-05, "loss": 3.0932, "step": 104000 }, { "epoch": 0.25161154788655327, "grad_norm": 17.101430892944336, "learning_rate": 4.158375447110536e-05, "loss": 3.1127, "step": 104500 }, { "epoch": 0.25281543089079517, "grad_norm": 20.582412719726562, "learning_rate": 4.151700558336388e-05, "loss": 3.0675, "step": 105000 }, { "epoch": 0.25281543089079517, "eval_runtime": 6156.9182, "eval_samples_per_second": 134.912, "eval_steps_per_second": 33.728, "step": 105000 }, { "epoch": 0.25401931389503707, "grad_norm": 14.351494789123535, "learning_rate": 4.14501229303163e-05, "loss": 3.0845, "step": 105500 }, { "epoch": 0.2552231968992789, "grad_norm": 11.951766967773438, "learning_rate": 4.138324027726873e-05, "loss": 3.0907, "step": 106000 }, { "epoch": 0.2564270799035208, "grad_norm": 13.831068992614746, "learning_rate": 4.131635762422115e-05, "loss": 3.1139, "step": 106500 }, { "epoch": 0.2576309629077627, "grad_norm": 16.089948654174805, "learning_rate": 4.124947497117358e-05, "loss": 3.085, "step": 107000 }, { "epoch": 0.25883484591200456, "grad_norm": 16.427217483520508, "learning_rate": 4.1182592318126004e-05, "loss": 3.1444, "step": 107500 }, { "epoch": 0.26003872891624646, "grad_norm": 16.443748474121094, "learning_rate": 4.111570966507843e-05, "loss": 3.1197, "step": 108000 }, { "epoch": 0.26124261192048837, "grad_norm": 12.318251609802246, "learning_rate": 4.1048827012030856e-05, "loss": 3.0734, "step": 108500 }, { "epoch": 0.2624464949247302, "grad_norm": 13.695268630981445, "learning_rate": 4.098194435898328e-05, "loss": 3.1275, "step": 109000 }, { "epoch": 0.2636503779289721, "grad_norm": 15.07443904876709, "learning_rate": 4.09151954712418e-05, "loss": 3.097, "step": 109500 }, { "epoch": 0.264854260933214, "grad_norm": 15.240448951721191, "learning_rate": 4.0848312818194226e-05, "loss": 3.088, "step": 110000 }, { "epoch": 0.264854260933214, "eval_runtime": 6153.3584, "eval_samples_per_second": 134.991, "eval_steps_per_second": 33.748, "step": 110000 }, { "epoch": 0.26605814393745586, "grad_norm": 13.12667179107666, "learning_rate": 4.078250028759541e-05, "loss": 3.0962, "step": 110500 }, { "epoch": 0.26726202694169776, "grad_norm": 17.520675659179688, "learning_rate": 4.0715617634547834e-05, "loss": 3.0786, "step": 111000 }, { "epoch": 0.2684659099459396, "grad_norm": 27.284038543701172, "learning_rate": 4.064886874680636e-05, "loss": 3.1162, "step": 111500 }, { "epoch": 0.2696697929501815, "grad_norm": 12.623812675476074, "learning_rate": 4.0581986093758775e-05, "loss": 3.0993, "step": 112000 }, { "epoch": 0.2708736759544234, "grad_norm": 14.702446937561035, "learning_rate": 4.0515103440711204e-05, "loss": 3.0733, "step": 112500 }, { "epoch": 0.27207755895866526, "grad_norm": 16.056833267211914, "learning_rate": 4.0448220787663634e-05, "loss": 3.0788, "step": 113000 }, { "epoch": 0.27328144196290716, "grad_norm": 12.753098487854004, "learning_rate": 4.038147189992215e-05, "loss": 3.0991, "step": 113500 }, { "epoch": 0.27448532496714906, "grad_norm": 13.137269020080566, "learning_rate": 4.0314589246874575e-05, "loss": 3.0871, "step": 114000 }, { "epoch": 0.2756892079713909, "grad_norm": 15.072389602661133, "learning_rate": 4.0247706593827004e-05, "loss": 3.115, "step": 114500 }, { "epoch": 0.2768930909756328, "grad_norm": 15.979447364807129, "learning_rate": 4.0180823940779426e-05, "loss": 3.1002, "step": 115000 }, { "epoch": 0.2768930909756328, "eval_runtime": 6179.6049, "eval_samples_per_second": 134.417, "eval_steps_per_second": 33.604, "step": 115000 }, { "epoch": 0.2780969739798747, "grad_norm": 13.973761558532715, "learning_rate": 4.011394128773185e-05, "loss": 3.0706, "step": 115500 }, { "epoch": 0.27930085698411655, "grad_norm": 16.156885147094727, "learning_rate": 4.004705863468428e-05, "loss": 3.0595, "step": 116000 }, { "epoch": 0.28050473998835845, "grad_norm": 14.320749282836914, "learning_rate": 3.99801759816367e-05, "loss": 3.1083, "step": 116500 }, { "epoch": 0.28170862299260035, "grad_norm": 13.002079010009766, "learning_rate": 3.991329332858912e-05, "loss": 3.0554, "step": 117000 }, { "epoch": 0.2829125059968422, "grad_norm": 19.574172973632812, "learning_rate": 3.984654444084764e-05, "loss": 3.1074, "step": 117500 }, { "epoch": 0.2841163890010841, "grad_norm": 12.356159210205078, "learning_rate": 3.977966178780007e-05, "loss": 3.1215, "step": 118000 }, { "epoch": 0.285320272005326, "grad_norm": 17.327226638793945, "learning_rate": 3.97127791347525e-05, "loss": 3.047, "step": 118500 }, { "epoch": 0.28652415500956785, "grad_norm": 16.561124801635742, "learning_rate": 3.964589648170492e-05, "loss": 3.1006, "step": 119000 }, { "epoch": 0.28772803801380975, "grad_norm": 14.118390083312988, "learning_rate": 3.9579013828657344e-05, "loss": 3.08, "step": 119500 }, { "epoch": 0.2889319210180516, "grad_norm": 15.130383491516113, "learning_rate": 3.951213117560977e-05, "loss": 3.0229, "step": 120000 }, { "epoch": 0.2889319210180516, "eval_runtime": 6265.7809, "eval_samples_per_second": 132.568, "eval_steps_per_second": 33.142, "step": 120000 }, { "epoch": 0.2901358040222935, "grad_norm": 20.27661895751953, "learning_rate": 3.944538228786829e-05, "loss": 3.0565, "step": 120500 }, { "epoch": 0.2913396870265354, "grad_norm": 15.461856842041016, "learning_rate": 3.9378499634820714e-05, "loss": 3.0717, "step": 121000 }, { "epoch": 0.29254357003077724, "grad_norm": 17.019287109375, "learning_rate": 3.931161698177314e-05, "loss": 3.1387, "step": 121500 }, { "epoch": 0.29374745303501915, "grad_norm": 18.06890106201172, "learning_rate": 3.9244734328725566e-05, "loss": 3.1166, "step": 122000 }, { "epoch": 0.29495133603926105, "grad_norm": 31.920703887939453, "learning_rate": 3.917798544098409e-05, "loss": 3.095, "step": 122500 }, { "epoch": 0.2961552190435029, "grad_norm": 15.199366569519043, "learning_rate": 3.9111102787936507e-05, "loss": 3.0706, "step": 123000 }, { "epoch": 0.2973591020477448, "grad_norm": 15.413779258728027, "learning_rate": 3.9044220134888936e-05, "loss": 3.121, "step": 123500 }, { "epoch": 0.2985629850519867, "grad_norm": 14.4086275100708, "learning_rate": 3.8977337481841365e-05, "loss": 3.087, "step": 124000 }, { "epoch": 0.29976686805622854, "grad_norm": 12.95889663696289, "learning_rate": 3.891045482879379e-05, "loss": 3.0934, "step": 124500 }, { "epoch": 0.30097075106047044, "grad_norm": 19.025604248046875, "learning_rate": 3.884357217574621e-05, "loss": 3.1332, "step": 125000 }, { "epoch": 0.30097075106047044, "eval_runtime": 6218.4719, "eval_samples_per_second": 133.577, "eval_steps_per_second": 33.394, "step": 125000 }, { "epoch": 0.30217463406471234, "grad_norm": 14.700455665588379, "learning_rate": 3.877668952269864e-05, "loss": 3.0799, "step": 125500 }, { "epoch": 0.3033785170689542, "grad_norm": 15.362942695617676, "learning_rate": 3.870994063495716e-05, "loss": 3.0551, "step": 126000 }, { "epoch": 0.3045824000731961, "grad_norm": 18.218399047851562, "learning_rate": 3.864305798190958e-05, "loss": 3.0529, "step": 126500 }, { "epoch": 0.305786283077438, "grad_norm": 18.461824417114258, "learning_rate": 3.857617532886201e-05, "loss": 3.1065, "step": 127000 }, { "epoch": 0.30699016608167984, "grad_norm": 12.244810104370117, "learning_rate": 3.850929267581443e-05, "loss": 3.0844, "step": 127500 }, { "epoch": 0.30819404908592174, "grad_norm": 20.86441993713379, "learning_rate": 3.8442410022766854e-05, "loss": 3.0551, "step": 128000 }, { "epoch": 0.30939793209016364, "grad_norm": 16.215953826904297, "learning_rate": 3.837552736971928e-05, "loss": 3.0748, "step": 128500 }, { "epoch": 0.3106018150944055, "grad_norm": 17.1651554107666, "learning_rate": 3.8308644716671705e-05, "loss": 3.144, "step": 129000 }, { "epoch": 0.3118056980986474, "grad_norm": 22.377321243286133, "learning_rate": 3.8241762063624134e-05, "loss": 3.1162, "step": 129500 }, { "epoch": 0.31300958110288923, "grad_norm": 21.55461883544922, "learning_rate": 3.817501317588265e-05, "loss": 3.1048, "step": 130000 }, { "epoch": 0.31300958110288923, "eval_runtime": 6198.7963, "eval_samples_per_second": 134.001, "eval_steps_per_second": 33.5, "step": 130000 }, { "epoch": 0.31421346410713114, "grad_norm": 17.96697425842285, "learning_rate": 3.8108130522835075e-05, "loss": 3.0576, "step": 130500 }, { "epoch": 0.31541734711137304, "grad_norm": 15.112616539001465, "learning_rate": 3.80412478697875e-05, "loss": 3.1265, "step": 131000 }, { "epoch": 0.3166212301156149, "grad_norm": 15.317338943481445, "learning_rate": 3.797449898204602e-05, "loss": 3.0716, "step": 131500 }, { "epoch": 0.3178251131198568, "grad_norm": 14.246545791625977, "learning_rate": 3.7907616328998445e-05, "loss": 3.1111, "step": 132000 }, { "epoch": 0.3190289961240987, "grad_norm": 14.737203598022461, "learning_rate": 3.7840733675950874e-05, "loss": 3.1051, "step": 132500 }, { "epoch": 0.32023287912834053, "grad_norm": 16.053455352783203, "learning_rate": 3.77738510229033e-05, "loss": 3.0498, "step": 133000 }, { "epoch": 0.32143676213258243, "grad_norm": 15.171459197998047, "learning_rate": 3.770696836985572e-05, "loss": 3.0535, "step": 133500 }, { "epoch": 0.32264064513682433, "grad_norm": 23.735517501831055, "learning_rate": 3.7640219482114245e-05, "loss": 3.0349, "step": 134000 }, { "epoch": 0.3238445281410662, "grad_norm": 13.836942672729492, "learning_rate": 3.757333682906667e-05, "loss": 3.0985, "step": 134500 }, { "epoch": 0.3250484111453081, "grad_norm": 15.954339027404785, "learning_rate": 3.750645417601909e-05, "loss": 3.0927, "step": 135000 }, { "epoch": 0.3250484111453081, "eval_runtime": 6258.0775, "eval_samples_per_second": 132.732, "eval_steps_per_second": 33.183, "step": 135000 }, { "epoch": 0.32625229414955, "grad_norm": 23.13224983215332, "learning_rate": 3.7439705288277615e-05, "loss": 3.0961, "step": 135500 }, { "epoch": 0.3274561771537918, "grad_norm": 11.840916633605957, "learning_rate": 3.737282263523004e-05, "loss": 3.0769, "step": 136000 }, { "epoch": 0.32866006015803373, "grad_norm": 11.10158634185791, "learning_rate": 3.7305939982182466e-05, "loss": 3.0942, "step": 136500 }, { "epoch": 0.32986394316227563, "grad_norm": 14.162835121154785, "learning_rate": 3.723905732913489e-05, "loss": 3.1289, "step": 137000 }, { "epoch": 0.3310678261665175, "grad_norm": 23.765029907226562, "learning_rate": 3.717217467608731e-05, "loss": 3.0774, "step": 137500 }, { "epoch": 0.3322717091707594, "grad_norm": 22.40215492248535, "learning_rate": 3.710542578834583e-05, "loss": 3.0886, "step": 138000 }, { "epoch": 0.3334755921750012, "grad_norm": 16.616819381713867, "learning_rate": 3.703854313529826e-05, "loss": 3.102, "step": 138500 }, { "epoch": 0.3346794751792431, "grad_norm": 19.094507217407227, "learning_rate": 3.697166048225068e-05, "loss": 3.1027, "step": 139000 }, { "epoch": 0.335883358183485, "grad_norm": 20.761945724487305, "learning_rate": 3.690477782920311e-05, "loss": 3.0609, "step": 139500 }, { "epoch": 0.33708724118772687, "grad_norm": 11.371627807617188, "learning_rate": 3.683789517615553e-05, "loss": 3.0916, "step": 140000 }, { "epoch": 0.33708724118772687, "eval_runtime": 6174.667, "eval_samples_per_second": 134.525, "eval_steps_per_second": 33.631, "step": 140000 }, { "epoch": 0.3382911241919688, "grad_norm": 15.36569881439209, "learning_rate": 3.6771012523107955e-05, "loss": 3.0964, "step": 140500 }, { "epoch": 0.3394950071962107, "grad_norm": 19.703203201293945, "learning_rate": 3.6704129870060384e-05, "loss": 3.0631, "step": 141000 }, { "epoch": 0.3406988902004525, "grad_norm": 23.92881965637207, "learning_rate": 3.663724721701281e-05, "loss": 3.0702, "step": 141500 }, { "epoch": 0.3419027732046944, "grad_norm": 18.54579734802246, "learning_rate": 3.657036456396523e-05, "loss": 3.0732, "step": 142000 }, { "epoch": 0.3431066562089363, "grad_norm": 13.281709671020508, "learning_rate": 3.650348191091766e-05, "loss": 3.0937, "step": 142500 }, { "epoch": 0.34431053921317817, "grad_norm": 17.042314529418945, "learning_rate": 3.6436733023176177e-05, "loss": 3.0914, "step": 143000 }, { "epoch": 0.34551442221742007, "grad_norm": 16.268789291381836, "learning_rate": 3.6369850370128606e-05, "loss": 3.0899, "step": 143500 }, { "epoch": 0.34671830522166197, "grad_norm": 26.38330841064453, "learning_rate": 3.630296771708103e-05, "loss": 3.0666, "step": 144000 }, { "epoch": 0.3479221882259038, "grad_norm": 14.961106300354004, "learning_rate": 3.623608506403345e-05, "loss": 3.069, "step": 144500 }, { "epoch": 0.3491260712301457, "grad_norm": 12.415295600891113, "learning_rate": 3.616920241098588e-05, "loss": 3.0293, "step": 145000 }, { "epoch": 0.3491260712301457, "eval_runtime": 6109.5629, "eval_samples_per_second": 135.958, "eval_steps_per_second": 33.99, "step": 145000 }, { "epoch": 0.3503299542343876, "grad_norm": 16.554115295410156, "learning_rate": 3.61024535232444e-05, "loss": 3.0739, "step": 145500 }, { "epoch": 0.35153383723862947, "grad_norm": 20.627267837524414, "learning_rate": 3.603557087019682e-05, "loss": 3.0799, "step": 146000 }, { "epoch": 0.35273772024287137, "grad_norm": 15.106368064880371, "learning_rate": 3.596868821714925e-05, "loss": 3.0417, "step": 146500 }, { "epoch": 0.3539416032471132, "grad_norm": 17.705570220947266, "learning_rate": 3.590180556410168e-05, "loss": 3.0896, "step": 147000 }, { "epoch": 0.3551454862513551, "grad_norm": 16.01241683959961, "learning_rate": 3.5834922911054094e-05, "loss": 3.0729, "step": 147500 }, { "epoch": 0.356349369255597, "grad_norm": 17.986221313476562, "learning_rate": 3.576817402331262e-05, "loss": 3.11, "step": 148000 }, { "epoch": 0.35755325225983886, "grad_norm": 17.471803665161133, "learning_rate": 3.570129137026504e-05, "loss": 3.0968, "step": 148500 }, { "epoch": 0.35875713526408076, "grad_norm": 16.683828353881836, "learning_rate": 3.563440871721747e-05, "loss": 3.0491, "step": 149000 }, { "epoch": 0.35996101826832266, "grad_norm": 18.689273834228516, "learning_rate": 3.5567526064169894e-05, "loss": 3.0183, "step": 149500 }, { "epoch": 0.3611649012725645, "grad_norm": 14.659083366394043, "learning_rate": 3.550064341112232e-05, "loss": 3.0965, "step": 150000 }, { "epoch": 0.3611649012725645, "eval_runtime": 6228.4893, "eval_samples_per_second": 133.362, "eval_steps_per_second": 33.341, "step": 150000 }, { "epoch": 0.3623687842768064, "grad_norm": 16.2710018157959, "learning_rate": 3.5433760758074745e-05, "loss": 3.1006, "step": 150500 }, { "epoch": 0.3635726672810483, "grad_norm": 16.394590377807617, "learning_rate": 3.5367011870333264e-05, "loss": 3.0602, "step": 151000 }, { "epoch": 0.36477655028529016, "grad_norm": 15.235190391540527, "learning_rate": 3.5300129217285686e-05, "loss": 3.0777, "step": 151500 }, { "epoch": 0.36598043328953206, "grad_norm": 15.201708793640137, "learning_rate": 3.5233246564238115e-05, "loss": 3.0595, "step": 152000 }, { "epoch": 0.36718431629377396, "grad_norm": 22.309728622436523, "learning_rate": 3.5166363911190544e-05, "loss": 3.0446, "step": 152500 }, { "epoch": 0.3683881992980158, "grad_norm": 13.854850769042969, "learning_rate": 3.509961502344906e-05, "loss": 3.0665, "step": 153000 }, { "epoch": 0.3695920823022577, "grad_norm": 14.474712371826172, "learning_rate": 3.5032732370401485e-05, "loss": 3.1098, "step": 153500 }, { "epoch": 0.3707959653064996, "grad_norm": 13.207783699035645, "learning_rate": 3.496584971735391e-05, "loss": 3.1007, "step": 154000 }, { "epoch": 0.37199984831074145, "grad_norm": 13.456844329833984, "learning_rate": 3.489896706430634e-05, "loss": 3.0957, "step": 154500 }, { "epoch": 0.37320373131498336, "grad_norm": 17.590436935424805, "learning_rate": 3.483208441125876e-05, "loss": 3.0295, "step": 155000 }, { "epoch": 0.37320373131498336, "eval_runtime": 6177.0488, "eval_samples_per_second": 134.473, "eval_steps_per_second": 33.618, "step": 155000 }, { "epoch": 0.3744076143192252, "grad_norm": 12.911888122558594, "learning_rate": 3.476520175821119e-05, "loss": 3.0661, "step": 155500 }, { "epoch": 0.3756114973234671, "grad_norm": 14.606691360473633, "learning_rate": 3.469831910516361e-05, "loss": 3.0804, "step": 156000 }, { "epoch": 0.376815380327709, "grad_norm": 18.043087005615234, "learning_rate": 3.463143645211603e-05, "loss": 3.1359, "step": 156500 }, { "epoch": 0.37801926333195085, "grad_norm": 15.033346176147461, "learning_rate": 3.456468756437455e-05, "loss": 2.9907, "step": 157000 }, { "epoch": 0.37922314633619275, "grad_norm": 17.020784378051758, "learning_rate": 3.449780491132698e-05, "loss": 3.0606, "step": 157500 }, { "epoch": 0.38042702934043465, "grad_norm": 22.74751091003418, "learning_rate": 3.44310560235855e-05, "loss": 3.1285, "step": 158000 }, { "epoch": 0.3816309123446765, "grad_norm": 14.052987098693848, "learning_rate": 3.436417337053793e-05, "loss": 3.0703, "step": 158500 }, { "epoch": 0.3828347953489184, "grad_norm": 22.046268463134766, "learning_rate": 3.429729071749035e-05, "loss": 3.0916, "step": 159000 }, { "epoch": 0.3840386783531603, "grad_norm": 23.049739837646484, "learning_rate": 3.4230408064442773e-05, "loss": 3.0909, "step": 159500 }, { "epoch": 0.38524256135740215, "grad_norm": 15.563003540039062, "learning_rate": 3.41635254113952e-05, "loss": 3.051, "step": 160000 }, { "epoch": 0.38524256135740215, "eval_runtime": 6216.4944, "eval_samples_per_second": 133.62, "eval_steps_per_second": 33.405, "step": 160000 }, { "epoch": 0.38644644436164405, "grad_norm": 11.055919647216797, "learning_rate": 3.4096642758347625e-05, "loss": 3.0614, "step": 160500 }, { "epoch": 0.38765032736588595, "grad_norm": 18.309402465820312, "learning_rate": 3.4029760105300054e-05, "loss": 3.0618, "step": 161000 }, { "epoch": 0.3888542103701278, "grad_norm": 15.657028198242188, "learning_rate": 3.396287745225247e-05, "loss": 3.068, "step": 161500 }, { "epoch": 0.3900580933743697, "grad_norm": 15.660598754882812, "learning_rate": 3.3896128564510995e-05, "loss": 3.0956, "step": 162000 }, { "epoch": 0.3912619763786116, "grad_norm": 17.219053268432617, "learning_rate": 3.382924591146342e-05, "loss": 3.0762, "step": 162500 }, { "epoch": 0.39246585938285344, "grad_norm": 15.2114896774292, "learning_rate": 3.376249702372194e-05, "loss": 3.0193, "step": 163000 }, { "epoch": 0.39366974238709534, "grad_norm": 15.437503814697266, "learning_rate": 3.3695614370674365e-05, "loss": 3.0757, "step": 163500 }, { "epoch": 0.39487362539133725, "grad_norm": 17.652286529541016, "learning_rate": 3.3628731717626794e-05, "loss": 3.0871, "step": 164000 }, { "epoch": 0.3960775083955791, "grad_norm": 14.703353881835938, "learning_rate": 3.356184906457922e-05, "loss": 3.0025, "step": 164500 }, { "epoch": 0.397281391399821, "grad_norm": 15.438825607299805, "learning_rate": 3.349496641153164e-05, "loss": 3.049, "step": 165000 }, { "epoch": 0.397281391399821, "eval_runtime": 6142.6208, "eval_samples_per_second": 135.226, "eval_steps_per_second": 33.807, "step": 165000 }, { "epoch": 0.39848527440406284, "grad_norm": 21.73479461669922, "learning_rate": 3.3428217523790165e-05, "loss": 3.0724, "step": 165500 }, { "epoch": 0.39968915740830474, "grad_norm": 13.589031219482422, "learning_rate": 3.336133487074259e-05, "loss": 3.0599, "step": 166000 }, { "epoch": 0.40089304041254664, "grad_norm": 12.588455200195312, "learning_rate": 3.329445221769501e-05, "loss": 3.0674, "step": 166500 }, { "epoch": 0.4020969234167885, "grad_norm": 16.856395721435547, "learning_rate": 3.322756956464744e-05, "loss": 3.0598, "step": 167000 }, { "epoch": 0.4033008064210304, "grad_norm": 14.325052261352539, "learning_rate": 3.316068691159986e-05, "loss": 3.1033, "step": 167500 }, { "epoch": 0.4045046894252723, "grad_norm": 20.509449005126953, "learning_rate": 3.3093938023858386e-05, "loss": 3.0843, "step": 168000 }, { "epoch": 0.40570857242951414, "grad_norm": 17.73023796081543, "learning_rate": 3.302705537081081e-05, "loss": 3.0367, "step": 168500 }, { "epoch": 0.40691245543375604, "grad_norm": 24.057329177856445, "learning_rate": 3.296017271776323e-05, "loss": 3.0771, "step": 169000 }, { "epoch": 0.40811633843799794, "grad_norm": 19.776145935058594, "learning_rate": 3.289329006471566e-05, "loss": 3.0784, "step": 169500 }, { "epoch": 0.4093202214422398, "grad_norm": 23.74951934814453, "learning_rate": 3.282654117697418e-05, "loss": 3.0786, "step": 170000 }, { "epoch": 0.4093202214422398, "eval_runtime": 6188.4105, "eval_samples_per_second": 134.226, "eval_steps_per_second": 33.557, "step": 170000 }, { "epoch": 0.4105241044464817, "grad_norm": 17.745681762695312, "learning_rate": 3.27596585239266e-05, "loss": 3.0666, "step": 170500 }, { "epoch": 0.4117279874507236, "grad_norm": 20.147336959838867, "learning_rate": 3.269277587087903e-05, "loss": 3.1238, "step": 171000 }, { "epoch": 0.41293187045496543, "grad_norm": 16.938888549804688, "learning_rate": 3.262589321783145e-05, "loss": 3.0414, "step": 171500 }, { "epoch": 0.41413575345920733, "grad_norm": 15.663901329040527, "learning_rate": 3.2559010564783875e-05, "loss": 3.0892, "step": 172000 }, { "epoch": 0.41533963646344924, "grad_norm": 16.39117431640625, "learning_rate": 3.2492127911736304e-05, "loss": 3.0685, "step": 172500 }, { "epoch": 0.4165435194676911, "grad_norm": 14.299029350280762, "learning_rate": 3.242537902399482e-05, "loss": 3.0725, "step": 173000 }, { "epoch": 0.417747402471933, "grad_norm": 11.168866157531738, "learning_rate": 3.235849637094725e-05, "loss": 3.0502, "step": 173500 }, { "epoch": 0.41895128547617483, "grad_norm": 13.38841724395752, "learning_rate": 3.2291613717899674e-05, "loss": 3.062, "step": 174000 }, { "epoch": 0.42015516848041673, "grad_norm": 14.151941299438477, "learning_rate": 3.2224731064852097e-05, "loss": 3.0666, "step": 174500 }, { "epoch": 0.42135905148465863, "grad_norm": 17.730104446411133, "learning_rate": 3.215784841180452e-05, "loss": 3.0709, "step": 175000 }, { "epoch": 0.42135905148465863, "eval_runtime": 6186.2143, "eval_samples_per_second": 134.274, "eval_steps_per_second": 33.569, "step": 175000 }, { "epoch": 0.4225629344889005, "grad_norm": 16.822513580322266, "learning_rate": 3.209096575875695e-05, "loss": 3.065, "step": 175500 }, { "epoch": 0.4237668174931424, "grad_norm": 15.454965591430664, "learning_rate": 3.202408310570938e-05, "loss": 3.0476, "step": 176000 }, { "epoch": 0.4249707004973843, "grad_norm": 21.14031410217285, "learning_rate": 3.195720045266179e-05, "loss": 3.0339, "step": 176500 }, { "epoch": 0.4261745835016261, "grad_norm": 19.002689361572266, "learning_rate": 3.189045156492032e-05, "loss": 3.0598, "step": 177000 }, { "epoch": 0.427378466505868, "grad_norm": 11.582403182983398, "learning_rate": 3.182356891187274e-05, "loss": 3.0454, "step": 177500 }, { "epoch": 0.42858234951010993, "grad_norm": 14.35600757598877, "learning_rate": 3.175668625882517e-05, "loss": 3.0677, "step": 178000 }, { "epoch": 0.4297862325143518, "grad_norm": 18.5367374420166, "learning_rate": 3.168980360577759e-05, "loss": 3.1098, "step": 178500 }, { "epoch": 0.4309901155185937, "grad_norm": 17.769344329833984, "learning_rate": 3.162305471803611e-05, "loss": 3.052, "step": 179000 }, { "epoch": 0.4321939985228356, "grad_norm": 17.472938537597656, "learning_rate": 3.155617206498854e-05, "loss": 3.0699, "step": 179500 }, { "epoch": 0.4333978815270774, "grad_norm": 14.995344161987305, "learning_rate": 3.148928941194096e-05, "loss": 3.0682, "step": 180000 }, { "epoch": 0.4333978815270774, "eval_runtime": 6302.7174, "eval_samples_per_second": 131.792, "eval_steps_per_second": 32.948, "step": 180000 }, { "epoch": 0.4346017645313193, "grad_norm": 17.150964736938477, "learning_rate": 3.1422406758893384e-05, "loss": 3.0906, "step": 180500 }, { "epoch": 0.4358056475355612, "grad_norm": 14.804174423217773, "learning_rate": 3.1355524105845814e-05, "loss": 3.0493, "step": 181000 }, { "epoch": 0.43700953053980307, "grad_norm": 17.898832321166992, "learning_rate": 3.128864145279824e-05, "loss": 3.089, "step": 181500 }, { "epoch": 0.43821341354404497, "grad_norm": 16.601884841918945, "learning_rate": 3.122189256505676e-05, "loss": 3.0688, "step": 182000 }, { "epoch": 0.4394172965482868, "grad_norm": 14.000849723815918, "learning_rate": 3.1155009912009184e-05, "loss": 3.0295, "step": 182500 }, { "epoch": 0.4406211795525287, "grad_norm": 17.828115463256836, "learning_rate": 3.1088127258961606e-05, "loss": 3.0588, "step": 183000 }, { "epoch": 0.4418250625567706, "grad_norm": 20.30364418029785, "learning_rate": 3.1021244605914035e-05, "loss": 3.0203, "step": 183500 }, { "epoch": 0.44302894556101247, "grad_norm": 17.606700897216797, "learning_rate": 3.095436195286646e-05, "loss": 3.0568, "step": 184000 }, { "epoch": 0.44423282856525437, "grad_norm": 17.633464813232422, "learning_rate": 3.0887613065124976e-05, "loss": 3.0702, "step": 184500 }, { "epoch": 0.44543671156949627, "grad_norm": 14.55715274810791, "learning_rate": 3.0820730412077405e-05, "loss": 3.0746, "step": 185000 }, { "epoch": 0.44543671156949627, "eval_runtime": 6306.1272, "eval_samples_per_second": 131.72, "eval_steps_per_second": 32.93, "step": 185000 }, { "epoch": 0.4466405945737381, "grad_norm": 16.668909072875977, "learning_rate": 3.075384775902983e-05, "loss": 3.0566, "step": 185500 }, { "epoch": 0.44784447757798, "grad_norm": 14.347661018371582, "learning_rate": 3.068696510598225e-05, "loss": 3.0616, "step": 186000 }, { "epoch": 0.4490483605822219, "grad_norm": 17.429546356201172, "learning_rate": 3.062021621824077e-05, "loss": 3.0875, "step": 186500 }, { "epoch": 0.45025224358646376, "grad_norm": 19.362503051757812, "learning_rate": 3.0553467330499294e-05, "loss": 3.057, "step": 187000 }, { "epoch": 0.45145612659070566, "grad_norm": 14.057225227355957, "learning_rate": 3.048658467745172e-05, "loss": 3.0644, "step": 187500 }, { "epoch": 0.45266000959494757, "grad_norm": 21.090145111083984, "learning_rate": 3.0419702024404146e-05, "loss": 3.0886, "step": 188000 }, { "epoch": 0.4538638925991894, "grad_norm": 13.602699279785156, "learning_rate": 3.0352819371356568e-05, "loss": 3.0649, "step": 188500 }, { "epoch": 0.4550677756034313, "grad_norm": 14.61277961730957, "learning_rate": 3.0285936718308994e-05, "loss": 3.0502, "step": 189000 }, { "epoch": 0.4562716586076732, "grad_norm": 14.571629524230957, "learning_rate": 3.021905406526142e-05, "loss": 3.0512, "step": 189500 }, { "epoch": 0.45747554161191506, "grad_norm": 16.995033264160156, "learning_rate": 3.0152171412213842e-05, "loss": 3.0619, "step": 190000 }, { "epoch": 0.45747554161191506, "eval_runtime": 6119.7371, "eval_samples_per_second": 135.732, "eval_steps_per_second": 33.933, "step": 190000 }, { "epoch": 0.45867942461615696, "grad_norm": 14.749920845031738, "learning_rate": 3.0085288759166268e-05, "loss": 3.0377, "step": 190500 }, { "epoch": 0.4598833076203988, "grad_norm": 18.717721939086914, "learning_rate": 3.0018406106118697e-05, "loss": 3.028, "step": 191000 }, { "epoch": 0.4610871906246407, "grad_norm": 13.981959342956543, "learning_rate": 2.995152345307112e-05, "loss": 3.0643, "step": 191500 }, { "epoch": 0.4622910736288826, "grad_norm": 13.590766906738281, "learning_rate": 2.9884640800023545e-05, "loss": 3.0734, "step": 192000 }, { "epoch": 0.46349495663312446, "grad_norm": 14.754199028015137, "learning_rate": 2.981775814697597e-05, "loss": 3.0575, "step": 192500 }, { "epoch": 0.46469883963736636, "grad_norm": 15.374496459960938, "learning_rate": 2.9751009259234493e-05, "loss": 3.0545, "step": 193000 }, { "epoch": 0.46590272264160826, "grad_norm": 17.713016510009766, "learning_rate": 2.968412660618691e-05, "loss": 3.022, "step": 193500 }, { "epoch": 0.4671066056458501, "grad_norm": 13.752087593078613, "learning_rate": 2.961724395313934e-05, "loss": 3.0129, "step": 194000 }, { "epoch": 0.468310488650092, "grad_norm": 11.1192626953125, "learning_rate": 2.9550361300091767e-05, "loss": 3.0285, "step": 194500 }, { "epoch": 0.4695143716543339, "grad_norm": 17.55103874206543, "learning_rate": 2.9483746177656378e-05, "loss": 3.045, "step": 195000 }, { "epoch": 0.4695143716543339, "eval_runtime": 6132.8059, "eval_samples_per_second": 135.443, "eval_steps_per_second": 33.861, "step": 195000 }, { "epoch": 0.47071825465857575, "grad_norm": 27.24392318725586, "learning_rate": 2.9416863524608807e-05, "loss": 3.0499, "step": 195500 }, { "epoch": 0.47192213766281765, "grad_norm": 14.595544815063477, "learning_rate": 2.9349980871561226e-05, "loss": 3.0375, "step": 196000 }, { "epoch": 0.47312602066705955, "grad_norm": 13.058863639831543, "learning_rate": 2.9283098218513655e-05, "loss": 3.1024, "step": 196500 }, { "epoch": 0.4743299036713014, "grad_norm": 15.837779998779297, "learning_rate": 2.921621556546608e-05, "loss": 3.082, "step": 197000 }, { "epoch": 0.4755337866755433, "grad_norm": 14.441446304321289, "learning_rate": 2.9149466677724603e-05, "loss": 3.0608, "step": 197500 }, { "epoch": 0.4767376696797852, "grad_norm": 16.908939361572266, "learning_rate": 2.9082584024677022e-05, "loss": 3.0524, "step": 198000 }, { "epoch": 0.47794155268402705, "grad_norm": 15.620512962341309, "learning_rate": 2.901570137162945e-05, "loss": 3.0614, "step": 198500 }, { "epoch": 0.47914543568826895, "grad_norm": 17.97640609741211, "learning_rate": 2.8948818718581877e-05, "loss": 3.0483, "step": 199000 }, { "epoch": 0.48034931869251085, "grad_norm": 19.494766235351562, "learning_rate": 2.88819360655343e-05, "loss": 3.0629, "step": 199500 }, { "epoch": 0.4815532016967527, "grad_norm": 18.747150421142578, "learning_rate": 2.8815053412486725e-05, "loss": 3.0774, "step": 200000 }, { "epoch": 0.4815532016967527, "eval_runtime": 6171.1886, "eval_samples_per_second": 134.6, "eval_steps_per_second": 33.65, "step": 200000 }, { "epoch": 0.4827570847009946, "grad_norm": 15.972591400146484, "learning_rate": 2.874817075943915e-05, "loss": 3.0938, "step": 200500 }, { "epoch": 0.48396096770523644, "grad_norm": 16.991474151611328, "learning_rate": 2.8681421871697673e-05, "loss": 3.0431, "step": 201000 }, { "epoch": 0.48516485070947835, "grad_norm": 16.47597312927246, "learning_rate": 2.8614539218650095e-05, "loss": 3.0886, "step": 201500 }, { "epoch": 0.48636873371372025, "grad_norm": 20.3975830078125, "learning_rate": 2.854765656560252e-05, "loss": 3.0562, "step": 202000 }, { "epoch": 0.4875726167179621, "grad_norm": 17.682926177978516, "learning_rate": 2.8480773912554947e-05, "loss": 3.1002, "step": 202500 }, { "epoch": 0.488776499722204, "grad_norm": 18.027238845825195, "learning_rate": 2.841389125950737e-05, "loss": 3.0798, "step": 203000 }, { "epoch": 0.4899803827264459, "grad_norm": 20.950571060180664, "learning_rate": 2.8347142371765888e-05, "loss": 3.0573, "step": 203500 }, { "epoch": 0.49118426573068774, "grad_norm": 17.63266372680664, "learning_rate": 2.8280259718718317e-05, "loss": 3.048, "step": 204000 }, { "epoch": 0.49238814873492964, "grad_norm": 17.037296295166016, "learning_rate": 2.8213377065670743e-05, "loss": 3.016, "step": 204500 }, { "epoch": 0.49359203173917154, "grad_norm": 21.214052200317383, "learning_rate": 2.8146494412623165e-05, "loss": 3.0676, "step": 205000 }, { "epoch": 0.49359203173917154, "eval_runtime": 6343.1785, "eval_samples_per_second": 130.951, "eval_steps_per_second": 32.738, "step": 205000 }, { "epoch": 0.4947959147434134, "grad_norm": 17.722492218017578, "learning_rate": 2.807961175957559e-05, "loss": 3.076, "step": 205500 }, { "epoch": 0.4959997977476553, "grad_norm": 17.147768020629883, "learning_rate": 2.801272910652802e-05, "loss": 3.0684, "step": 206000 }, { "epoch": 0.4972036807518972, "grad_norm": 15.113913536071777, "learning_rate": 2.794584645348044e-05, "loss": 3.0133, "step": 206500 }, { "epoch": 0.49840756375613904, "grad_norm": 15.339323043823242, "learning_rate": 2.7878963800432868e-05, "loss": 3.06, "step": 207000 }, { "epoch": 0.49961144676038094, "grad_norm": 14.279352188110352, "learning_rate": 2.7812214912691387e-05, "loss": 3.0718, "step": 207500 }, { "epoch": 0.5008153297646228, "grad_norm": 15.7473726272583, "learning_rate": 2.7745466024949905e-05, "loss": 3.0382, "step": 208000 }, { "epoch": 0.5020192127688647, "grad_norm": 16.69623374938965, "learning_rate": 2.7678583371902334e-05, "loss": 3.0469, "step": 208500 }, { "epoch": 0.5032230957731065, "grad_norm": 12.795482635498047, "learning_rate": 2.7611700718854753e-05, "loss": 3.0691, "step": 209000 }, { "epoch": 0.5044269787773484, "grad_norm": 15.719594955444336, "learning_rate": 2.7544818065807182e-05, "loss": 3.0843, "step": 209500 }, { "epoch": 0.5056308617815903, "grad_norm": 16.107906341552734, "learning_rate": 2.74780691780657e-05, "loss": 3.0939, "step": 210000 }, { "epoch": 0.5056308617815903, "eval_runtime": 6288.8164, "eval_samples_per_second": 132.083, "eval_steps_per_second": 33.021, "step": 210000 }, { "epoch": 0.5068347447858322, "grad_norm": 22.665922164916992, "learning_rate": 2.741118652501813e-05, "loss": 3.0311, "step": 210500 }, { "epoch": 0.5080386277900741, "grad_norm": 12.993492126464844, "learning_rate": 2.734430387197055e-05, "loss": 3.0409, "step": 211000 }, { "epoch": 0.509242510794316, "grad_norm": 13.392237663269043, "learning_rate": 2.727742121892298e-05, "loss": 3.0185, "step": 211500 }, { "epoch": 0.5104463937985578, "grad_norm": 18.179622650146484, "learning_rate": 2.7210538565875404e-05, "loss": 3.1036, "step": 212000 }, { "epoch": 0.5116502768027997, "grad_norm": 16.70694923400879, "learning_rate": 2.7143655912827826e-05, "loss": 3.063, "step": 212500 }, { "epoch": 0.5128541598070416, "grad_norm": 23.674760818481445, "learning_rate": 2.7076773259780252e-05, "loss": 3.0342, "step": 213000 }, { "epoch": 0.5140580428112835, "grad_norm": 19.409990310668945, "learning_rate": 2.701002437203877e-05, "loss": 3.0462, "step": 213500 }, { "epoch": 0.5152619258155254, "grad_norm": 15.574653625488281, "learning_rate": 2.69431417189912e-05, "loss": 3.0292, "step": 214000 }, { "epoch": 0.5164658088197672, "grad_norm": 17.644498825073242, "learning_rate": 2.6876259065943622e-05, "loss": 3.0152, "step": 214500 }, { "epoch": 0.5176696918240091, "grad_norm": 14.58530330657959, "learning_rate": 2.6809376412896048e-05, "loss": 3.1034, "step": 215000 }, { "epoch": 0.5176696918240091, "eval_runtime": 6223.0215, "eval_samples_per_second": 133.479, "eval_steps_per_second": 33.37, "step": 215000 }, { "epoch": 0.518873574828251, "grad_norm": 19.024547576904297, "learning_rate": 2.674249375984847e-05, "loss": 3.0733, "step": 215500 }, { "epoch": 0.5200774578324929, "grad_norm": 17.260374069213867, "learning_rate": 2.6675611106800896e-05, "loss": 3.0252, "step": 216000 }, { "epoch": 0.5212813408367348, "grad_norm": 18.4815673828125, "learning_rate": 2.6608862219059415e-05, "loss": 3.069, "step": 216500 }, { "epoch": 0.5224852238409767, "grad_norm": 15.065186500549316, "learning_rate": 2.6541979566011844e-05, "loss": 3.0697, "step": 217000 }, { "epoch": 0.5236891068452185, "grad_norm": 16.79564666748047, "learning_rate": 2.6475096912964263e-05, "loss": 3.0433, "step": 217500 }, { "epoch": 0.5248929898494604, "grad_norm": 18.250133514404297, "learning_rate": 2.6408214259916692e-05, "loss": 3.0243, "step": 218000 }, { "epoch": 0.5260968728537023, "grad_norm": 15.040393829345703, "learning_rate": 2.6341331606869118e-05, "loss": 3.0501, "step": 218500 }, { "epoch": 0.5273007558579442, "grad_norm": 18.00982093811035, "learning_rate": 2.627444895382154e-05, "loss": 3.0481, "step": 219000 }, { "epoch": 0.5285046388621861, "grad_norm": 14.428119659423828, "learning_rate": 2.6207566300773966e-05, "loss": 3.0788, "step": 219500 }, { "epoch": 0.529708521866428, "grad_norm": 19.191162109375, "learning_rate": 2.6140683647726395e-05, "loss": 3.0549, "step": 220000 }, { "epoch": 0.529708521866428, "eval_runtime": 6262.92, "eval_samples_per_second": 132.629, "eval_steps_per_second": 33.157, "step": 220000 }, { "epoch": 0.5309124048706698, "grad_norm": 18.9827938079834, "learning_rate": 2.6073934759984914e-05, "loss": 3.032, "step": 220500 }, { "epoch": 0.5321162878749117, "grad_norm": 16.249061584472656, "learning_rate": 2.6007052106937336e-05, "loss": 3.0587, "step": 221000 }, { "epoch": 0.5333201708791536, "grad_norm": 27.886228561401367, "learning_rate": 2.5940303219195855e-05, "loss": 3.0959, "step": 221500 }, { "epoch": 0.5345240538833955, "grad_norm": 28.477378845214844, "learning_rate": 2.587342056614828e-05, "loss": 3.0545, "step": 222000 }, { "epoch": 0.5357279368876374, "grad_norm": 54.090702056884766, "learning_rate": 2.580653791310071e-05, "loss": 3.0052, "step": 222500 }, { "epoch": 0.5369318198918792, "grad_norm": 20.456764221191406, "learning_rate": 2.5739655260053132e-05, "loss": 3.0362, "step": 223000 }, { "epoch": 0.5381357028961211, "grad_norm": 18.759544372558594, "learning_rate": 2.5672772607005558e-05, "loss": 3.0841, "step": 223500 }, { "epoch": 0.539339585900363, "grad_norm": 24.140661239624023, "learning_rate": 2.5605889953957983e-05, "loss": 3.0545, "step": 224000 }, { "epoch": 0.5405434689046049, "grad_norm": 15.08611011505127, "learning_rate": 2.5539007300910406e-05, "loss": 3.0784, "step": 224500 }, { "epoch": 0.5417473519088468, "grad_norm": 20.986557006835938, "learning_rate": 2.547212464786283e-05, "loss": 3.0682, "step": 225000 }, { "epoch": 0.5417473519088468, "eval_runtime": 6240.4652, "eval_samples_per_second": 133.106, "eval_steps_per_second": 33.277, "step": 225000 }, { "epoch": 0.5429512349130887, "grad_norm": 11.451869010925293, "learning_rate": 2.5405375760121354e-05, "loss": 3.0627, "step": 225500 }, { "epoch": 0.5441551179173305, "grad_norm": 17.614988327026367, "learning_rate": 2.5338626872379872e-05, "loss": 3.0518, "step": 226000 }, { "epoch": 0.5453590009215724, "grad_norm": 14.993136405944824, "learning_rate": 2.5271744219332298e-05, "loss": 3.0515, "step": 226500 }, { "epoch": 0.5465628839258143, "grad_norm": 21.78707504272461, "learning_rate": 2.520486156628472e-05, "loss": 3.0632, "step": 227000 }, { "epoch": 0.5477667669300562, "grad_norm": 16.39373207092285, "learning_rate": 2.513797891323715e-05, "loss": 3.0524, "step": 227500 }, { "epoch": 0.5489706499342981, "grad_norm": 13.787343978881836, "learning_rate": 2.5071230025495668e-05, "loss": 3.0449, "step": 228000 }, { "epoch": 0.55017453293854, "grad_norm": 19.658519744873047, "learning_rate": 2.5004347372448094e-05, "loss": 3.0304, "step": 228500 }, { "epoch": 0.5513784159427818, "grad_norm": 16.18865203857422, "learning_rate": 2.493746471940052e-05, "loss": 3.0746, "step": 229000 }, { "epoch": 0.5525822989470237, "grad_norm": 17.702472686767578, "learning_rate": 2.4870582066352942e-05, "loss": 3.07, "step": 229500 }, { "epoch": 0.5537861819512656, "grad_norm": 18.08761215209961, "learning_rate": 2.4803699413305368e-05, "loss": 3.0417, "step": 230000 }, { "epoch": 0.5537861819512656, "eval_runtime": 6192.5264, "eval_samples_per_second": 134.137, "eval_steps_per_second": 33.534, "step": 230000 }, { "epoch": 0.5549900649555075, "grad_norm": 12.940227508544922, "learning_rate": 2.473695052556389e-05, "loss": 3.0623, "step": 230500 }, { "epoch": 0.5561939479597494, "grad_norm": 14.184712409973145, "learning_rate": 2.4670067872516316e-05, "loss": 3.0565, "step": 231000 }, { "epoch": 0.5573978309639912, "grad_norm": 16.096614837646484, "learning_rate": 2.4603185219468738e-05, "loss": 2.9976, "step": 231500 }, { "epoch": 0.5586017139682331, "grad_norm": 15.835817337036133, "learning_rate": 2.4536302566421164e-05, "loss": 2.9842, "step": 232000 }, { "epoch": 0.559805596972475, "grad_norm": 22.432340621948242, "learning_rate": 2.446941991337359e-05, "loss": 3.0831, "step": 232500 }, { "epoch": 0.5610094799767169, "grad_norm": 19.895309448242188, "learning_rate": 2.4402537260326015e-05, "loss": 3.0444, "step": 233000 }, { "epoch": 0.5622133629809588, "grad_norm": 14.998634338378906, "learning_rate": 2.4335788372584534e-05, "loss": 3.0233, "step": 233500 }, { "epoch": 0.5634172459852007, "grad_norm": 12.780035972595215, "learning_rate": 2.426890571953696e-05, "loss": 3.0215, "step": 234000 }, { "epoch": 0.5646211289894425, "grad_norm": 18.854740142822266, "learning_rate": 2.4202023066489385e-05, "loss": 3.0684, "step": 234500 }, { "epoch": 0.5658250119936844, "grad_norm": 17.486467361450195, "learning_rate": 2.4135140413441808e-05, "loss": 3.053, "step": 235000 }, { "epoch": 0.5658250119936844, "eval_runtime": 6288.8357, "eval_samples_per_second": 132.082, "eval_steps_per_second": 33.021, "step": 235000 }, { "epoch": 0.5670288949979263, "grad_norm": 14.92556095123291, "learning_rate": 2.4068257760394233e-05, "loss": 3.0641, "step": 235500 }, { "epoch": 0.5682327780021682, "grad_norm": 13.280654907226562, "learning_rate": 2.400137510734666e-05, "loss": 3.0217, "step": 236000 }, { "epoch": 0.5694366610064101, "grad_norm": 16.9669246673584, "learning_rate": 2.393462621960518e-05, "loss": 3.0162, "step": 236500 }, { "epoch": 0.570640544010652, "grad_norm": 14.215867042541504, "learning_rate": 2.3867743566557604e-05, "loss": 3.0158, "step": 237000 }, { "epoch": 0.5718444270148938, "grad_norm": 19.857236862182617, "learning_rate": 2.380086091351003e-05, "loss": 3.0011, "step": 237500 }, { "epoch": 0.5730483100191357, "grad_norm": 14.70789909362793, "learning_rate": 2.3733978260462455e-05, "loss": 3.0155, "step": 238000 }, { "epoch": 0.5742521930233776, "grad_norm": 16.156538009643555, "learning_rate": 2.3667229372720977e-05, "loss": 3.0281, "step": 238500 }, { "epoch": 0.5754560760276195, "grad_norm": 29.431739807128906, "learning_rate": 2.36003467196734e-05, "loss": 3.0404, "step": 239000 }, { "epoch": 0.5766599590318614, "grad_norm": 14.224696159362793, "learning_rate": 2.3533464066625825e-05, "loss": 3.0172, "step": 239500 }, { "epoch": 0.5778638420361032, "grad_norm": 19.29595184326172, "learning_rate": 2.346658141357825e-05, "loss": 3.0622, "step": 240000 }, { "epoch": 0.5778638420361032, "eval_runtime": 6227.4429, "eval_samples_per_second": 133.385, "eval_steps_per_second": 33.346, "step": 240000 }, { "epoch": 0.5790677250403451, "grad_norm": 24.003347396850586, "learning_rate": 2.3399698760530677e-05, "loss": 2.9962, "step": 240500 }, { "epoch": 0.580271608044587, "grad_norm": 16.034706115722656, "learning_rate": 2.33328161074831e-05, "loss": 3.0286, "step": 241000 }, { "epoch": 0.5814754910488289, "grad_norm": 16.609622955322266, "learning_rate": 2.3265933454435525e-05, "loss": 3.031, "step": 241500 }, { "epoch": 0.5826793740530708, "grad_norm": 30.813108444213867, "learning_rate": 2.319905080138795e-05, "loss": 3.0143, "step": 242000 }, { "epoch": 0.5838832570573127, "grad_norm": 15.091474533081055, "learning_rate": 2.313230191364647e-05, "loss": 3.0475, "step": 242500 }, { "epoch": 0.5850871400615545, "grad_norm": 19.889976501464844, "learning_rate": 2.3065419260598895e-05, "loss": 3.0551, "step": 243000 }, { "epoch": 0.5862910230657964, "grad_norm": 16.42539405822754, "learning_rate": 2.299853660755132e-05, "loss": 2.9885, "step": 243500 }, { "epoch": 0.5874949060700383, "grad_norm": 18.250354766845703, "learning_rate": 2.2931653954503746e-05, "loss": 3.0267, "step": 244000 }, { "epoch": 0.5886987890742802, "grad_norm": 11.44227409362793, "learning_rate": 2.286477130145617e-05, "loss": 2.9568, "step": 244500 }, { "epoch": 0.5899026720785221, "grad_norm": 21.37769889831543, "learning_rate": 2.279802241371469e-05, "loss": 3.0259, "step": 245000 }, { "epoch": 0.5899026720785221, "eval_runtime": 6297.4297, "eval_samples_per_second": 131.902, "eval_steps_per_second": 32.976, "step": 245000 }, { "epoch": 0.591106555082764, "grad_norm": 15.137754440307617, "learning_rate": 2.273127352597321e-05, "loss": 3.0087, "step": 245500 }, { "epoch": 0.5923104380870058, "grad_norm": 15.59156608581543, "learning_rate": 2.2664390872925635e-05, "loss": 3.0397, "step": 246000 }, { "epoch": 0.5935143210912477, "grad_norm": 14.741199493408203, "learning_rate": 2.259750821987806e-05, "loss": 3.0505, "step": 246500 }, { "epoch": 0.5947182040954896, "grad_norm": 37.30345153808594, "learning_rate": 2.2530625566830483e-05, "loss": 3.0312, "step": 247000 }, { "epoch": 0.5959220870997315, "grad_norm": 16.39379119873047, "learning_rate": 2.2463742913782912e-05, "loss": 3.0068, "step": 247500 }, { "epoch": 0.5971259701039734, "grad_norm": 16.724523544311523, "learning_rate": 2.2396860260735335e-05, "loss": 3.0172, "step": 248000 }, { "epoch": 0.5983298531082153, "grad_norm": 13.491678237915039, "learning_rate": 2.2330111372993857e-05, "loss": 3.0396, "step": 248500 }, { "epoch": 0.5995337361124571, "grad_norm": 17.01793670654297, "learning_rate": 2.226322871994628e-05, "loss": 3.0092, "step": 249000 }, { "epoch": 0.600737619116699, "grad_norm": 16.2504825592041, "learning_rate": 2.219634606689871e-05, "loss": 3.0564, "step": 249500 }, { "epoch": 0.6019415021209409, "grad_norm": 19.381729125976562, "learning_rate": 2.212946341385113e-05, "loss": 2.9991, "step": 250000 }, { "epoch": 0.6019415021209409, "eval_runtime": 6343.8627, "eval_samples_per_second": 130.937, "eval_steps_per_second": 32.734, "step": 250000 }, { "epoch": 0.6031453851251828, "grad_norm": 15.789433479309082, "learning_rate": 2.2062714526109653e-05, "loss": 3.0164, "step": 250500 }, { "epoch": 0.6043492681294247, "grad_norm": 15.380681037902832, "learning_rate": 2.1995831873062075e-05, "loss": 3.006, "step": 251000 }, { "epoch": 0.6055531511336665, "grad_norm": 12.976866722106934, "learning_rate": 2.19289492200145e-05, "loss": 3.099, "step": 251500 }, { "epoch": 0.6067570341379084, "grad_norm": 17.682626724243164, "learning_rate": 2.1862066566966927e-05, "loss": 3.0381, "step": 252000 }, { "epoch": 0.6079609171421503, "grad_norm": 15.32071304321289, "learning_rate": 2.1795183913919352e-05, "loss": 3.0404, "step": 252500 }, { "epoch": 0.6091648001463922, "grad_norm": 21.887651443481445, "learning_rate": 2.1728301260871775e-05, "loss": 3.0282, "step": 253000 }, { "epoch": 0.6103686831506341, "grad_norm": 16.731210708618164, "learning_rate": 2.1661552373130297e-05, "loss": 3.0219, "step": 253500 }, { "epoch": 0.611572566154876, "grad_norm": 22.759746551513672, "learning_rate": 2.1594669720082722e-05, "loss": 3.0442, "step": 254000 }, { "epoch": 0.6127764491591178, "grad_norm": 18.68710708618164, "learning_rate": 2.1527787067035145e-05, "loss": 3.0091, "step": 254500 }, { "epoch": 0.6139803321633597, "grad_norm": 23.144712448120117, "learning_rate": 2.146090441398757e-05, "loss": 3.0501, "step": 255000 }, { "epoch": 0.6139803321633597, "eval_runtime": 6230.1182, "eval_samples_per_second": 133.327, "eval_steps_per_second": 33.332, "step": 255000 }, { "epoch": 0.6151842151676016, "grad_norm": 18.833757400512695, "learning_rate": 2.1394021760939996e-05, "loss": 3.1018, "step": 255500 }, { "epoch": 0.6163880981718435, "grad_norm": 21.688997268676758, "learning_rate": 2.132727287319852e-05, "loss": 3.0579, "step": 256000 }, { "epoch": 0.6175919811760854, "grad_norm": 17.346538543701172, "learning_rate": 2.126039022015094e-05, "loss": 3.0306, "step": 256500 }, { "epoch": 0.6187958641803273, "grad_norm": 18.86598014831543, "learning_rate": 2.1193507567103366e-05, "loss": 3.0237, "step": 257000 }, { "epoch": 0.6199997471845691, "grad_norm": 13.735309600830078, "learning_rate": 2.1126624914055792e-05, "loss": 3.0416, "step": 257500 }, { "epoch": 0.621203630188811, "grad_norm": 21.433256149291992, "learning_rate": 2.1059742261008218e-05, "loss": 3.0162, "step": 258000 }, { "epoch": 0.6224075131930529, "grad_norm": 18.01786231994629, "learning_rate": 2.099285960796064e-05, "loss": 3.0192, "step": 258500 }, { "epoch": 0.6236113961972948, "grad_norm": 17.93750762939453, "learning_rate": 2.092597695491307e-05, "loss": 3.0162, "step": 259000 }, { "epoch": 0.6248152792015367, "grad_norm": 19.375873565673828, "learning_rate": 2.0859094301865492e-05, "loss": 2.9953, "step": 259500 }, { "epoch": 0.6260191622057785, "grad_norm": 16.76817512512207, "learning_rate": 2.0792479179430107e-05, "loss": 2.9848, "step": 260000 }, { "epoch": 0.6260191622057785, "eval_runtime": 6319.8113, "eval_samples_per_second": 131.435, "eval_steps_per_second": 32.859, "step": 260000 }, { "epoch": 0.6272230452100204, "grad_norm": 19.69635009765625, "learning_rate": 2.0725596526382532e-05, "loss": 3.0555, "step": 260500 }, { "epoch": 0.6284269282142623, "grad_norm": 16.243324279785156, "learning_rate": 2.0658713873334955e-05, "loss": 3.0212, "step": 261000 }, { "epoch": 0.6296308112185042, "grad_norm": 17.867599487304688, "learning_rate": 2.0591831220287384e-05, "loss": 3.0451, "step": 261500 }, { "epoch": 0.6308346942227461, "grad_norm": 17.559730529785156, "learning_rate": 2.0525082332545903e-05, "loss": 3.012, "step": 262000 }, { "epoch": 0.632038577226988, "grad_norm": 14.618083953857422, "learning_rate": 2.045833344480442e-05, "loss": 3.034, "step": 262500 }, { "epoch": 0.6332424602312298, "grad_norm": 16.521699905395508, "learning_rate": 2.0391450791756847e-05, "loss": 3.0197, "step": 263000 }, { "epoch": 0.6344463432354717, "grad_norm": 16.326717376708984, "learning_rate": 2.0324568138709273e-05, "loss": 3.0566, "step": 263500 }, { "epoch": 0.6356502262397136, "grad_norm": 22.72909164428711, "learning_rate": 2.02576854856617e-05, "loss": 3.0413, "step": 264000 }, { "epoch": 0.6368541092439555, "grad_norm": 21.150442123413086, "learning_rate": 2.019080283261412e-05, "loss": 3.0337, "step": 264500 }, { "epoch": 0.6380579922481974, "grad_norm": 18.094627380371094, "learning_rate": 2.0123920179566547e-05, "loss": 3.0103, "step": 265000 }, { "epoch": 0.6380579922481974, "eval_runtime": 6283.8462, "eval_samples_per_second": 132.187, "eval_steps_per_second": 33.047, "step": 265000 }, { "epoch": 0.6392618752524393, "grad_norm": 16.778398513793945, "learning_rate": 2.0057037526518972e-05, "loss": 3.0193, "step": 265500 }, { "epoch": 0.6404657582566811, "grad_norm": 16.389066696166992, "learning_rate": 1.9990154873471398e-05, "loss": 3.0297, "step": 266000 }, { "epoch": 0.641669641260923, "grad_norm": 15.284423828125, "learning_rate": 1.9923405985729917e-05, "loss": 3.0253, "step": 266500 }, { "epoch": 0.6428735242651649, "grad_norm": 21.423006057739258, "learning_rate": 1.9856523332682343e-05, "loss": 3.0313, "step": 267000 }, { "epoch": 0.6440774072694068, "grad_norm": 17.86176109313965, "learning_rate": 1.9789640679634768e-05, "loss": 3.0644, "step": 267500 }, { "epoch": 0.6452812902736487, "grad_norm": 19.17348861694336, "learning_rate": 1.9722758026587194e-05, "loss": 3.0494, "step": 268000 }, { "epoch": 0.6464851732778905, "grad_norm": 19.088390350341797, "learning_rate": 1.9655875373539616e-05, "loss": 3.0172, "step": 268500 }, { "epoch": 0.6476890562821324, "grad_norm": 17.714704513549805, "learning_rate": 1.9588992720492046e-05, "loss": 3.0296, "step": 269000 }, { "epoch": 0.6488929392863743, "grad_norm": 16.175125122070312, "learning_rate": 1.9522110067444468e-05, "loss": 3.033, "step": 269500 }, { "epoch": 0.6500968222906162, "grad_norm": 13.180002212524414, "learning_rate": 1.9455227414396894e-05, "loss": 3.042, "step": 270000 }, { "epoch": 0.6500968222906162, "eval_runtime": 6276.2648, "eval_samples_per_second": 132.347, "eval_steps_per_second": 33.087, "step": 270000 }, { "epoch": 0.6513007052948581, "grad_norm": 19.098552703857422, "learning_rate": 1.9388478526655412e-05, "loss": 3.0693, "step": 270500 }, { "epoch": 0.6525045882991, "grad_norm": 17.581096649169922, "learning_rate": 1.9321595873607838e-05, "loss": 3.0159, "step": 271000 }, { "epoch": 0.6537084713033418, "grad_norm": 16.60484504699707, "learning_rate": 1.9254713220560264e-05, "loss": 3.0212, "step": 271500 }, { "epoch": 0.6549123543075837, "grad_norm": 16.275178909301758, "learning_rate": 1.918783056751269e-05, "loss": 3.0536, "step": 272000 }, { "epoch": 0.6561162373118256, "grad_norm": 18.09239959716797, "learning_rate": 1.9121081679771208e-05, "loss": 3.0576, "step": 272500 }, { "epoch": 0.6573201203160675, "grad_norm": 17.817174911499023, "learning_rate": 1.9054199026723634e-05, "loss": 3.006, "step": 273000 }, { "epoch": 0.6585240033203094, "grad_norm": 20.33548355102539, "learning_rate": 1.8987450138982156e-05, "loss": 3.0236, "step": 273500 }, { "epoch": 0.6597278863245513, "grad_norm": 16.80567169189453, "learning_rate": 1.892056748593458e-05, "loss": 3.0272, "step": 274000 }, { "epoch": 0.660931769328793, "grad_norm": 14.377747535705566, "learning_rate": 1.8853684832887004e-05, "loss": 3.0447, "step": 274500 }, { "epoch": 0.662135652333035, "grad_norm": 20.724485397338867, "learning_rate": 1.878680217983943e-05, "loss": 3.0422, "step": 275000 }, { "epoch": 0.662135652333035, "eval_runtime": 6186.2818, "eval_samples_per_second": 134.272, "eval_steps_per_second": 33.568, "step": 275000 }, { "epoch": 0.6633395353372769, "grad_norm": 18.72093963623047, "learning_rate": 1.8719919526791856e-05, "loss": 3.0455, "step": 275500 }, { "epoch": 0.6645434183415188, "grad_norm": 20.733427047729492, "learning_rate": 1.8653170639050374e-05, "loss": 3.0217, "step": 276000 }, { "epoch": 0.6657473013457607, "grad_norm": 20.21004295349121, "learning_rate": 1.85862879860028e-05, "loss": 3.0201, "step": 276500 }, { "epoch": 0.6669511843500024, "grad_norm": 16.68962860107422, "learning_rate": 1.8519405332955226e-05, "loss": 3.0333, "step": 277000 }, { "epoch": 0.6681550673542443, "grad_norm": 16.575241088867188, "learning_rate": 1.8452522679907648e-05, "loss": 3.018, "step": 277500 }, { "epoch": 0.6693589503584862, "grad_norm": 19.38899803161621, "learning_rate": 1.8385640026860074e-05, "loss": 3.0496, "step": 278000 }, { "epoch": 0.6705628333627282, "grad_norm": 14.967867851257324, "learning_rate": 1.831902490442469e-05, "loss": 2.999, "step": 278500 }, { "epoch": 0.67176671636697, "grad_norm": 22.434553146362305, "learning_rate": 1.8252142251377114e-05, "loss": 3.0349, "step": 279000 }, { "epoch": 0.672970599371212, "grad_norm": 16.710906982421875, "learning_rate": 1.818525959832954e-05, "loss": 3.0342, "step": 279500 }, { "epoch": 0.6741744823754537, "grad_norm": 15.848820686340332, "learning_rate": 1.8118376945281966e-05, "loss": 3.0272, "step": 280000 }, { "epoch": 0.6741744823754537, "eval_runtime": 6353.5428, "eval_samples_per_second": 130.737, "eval_steps_per_second": 32.684, "step": 280000 }, { "epoch": 0.6753783653796956, "grad_norm": 15.844106674194336, "learning_rate": 1.805149429223439e-05, "loss": 3.0116, "step": 280500 }, { "epoch": 0.6765822483839375, "grad_norm": 19.46364402770996, "learning_rate": 1.7984611639186817e-05, "loss": 3.0428, "step": 281000 }, { "epoch": 0.6777861313881794, "grad_norm": 16.986345291137695, "learning_rate": 1.791772898613924e-05, "loss": 3.0407, "step": 281500 }, { "epoch": 0.6789900143924213, "grad_norm": 19.00211524963379, "learning_rate": 1.7850846333091666e-05, "loss": 3.0754, "step": 282000 }, { "epoch": 0.6801938973966632, "grad_norm": 16.347320556640625, "learning_rate": 1.778396368004409e-05, "loss": 3.0583, "step": 282500 }, { "epoch": 0.681397780400905, "grad_norm": 17.984121322631836, "learning_rate": 1.7717081026996517e-05, "loss": 3.0078, "step": 283000 }, { "epoch": 0.6826016634051469, "grad_norm": 13.47775936126709, "learning_rate": 1.765019837394894e-05, "loss": 3.0313, "step": 283500 }, { "epoch": 0.6838055464093888, "grad_norm": 19.955591201782227, "learning_rate": 1.7583449486207458e-05, "loss": 3.0128, "step": 284000 }, { "epoch": 0.6850094294136307, "grad_norm": 15.306801795959473, "learning_rate": 1.7516566833159887e-05, "loss": 3.0537, "step": 284500 }, { "epoch": 0.6862133124178726, "grad_norm": 18.41864013671875, "learning_rate": 1.744968418011231e-05, "loss": 2.9884, "step": 285000 }, { "epoch": 0.6862133124178726, "eval_runtime": 6358.5857, "eval_samples_per_second": 130.634, "eval_steps_per_second": 32.659, "step": 285000 }, { "epoch": 0.6874171954221144, "grad_norm": 23.076107025146484, "learning_rate": 1.7382801527064735e-05, "loss": 3.0266, "step": 285500 }, { "epoch": 0.6886210784263563, "grad_norm": 13.705315589904785, "learning_rate": 1.7315918874017158e-05, "loss": 3.0475, "step": 286000 }, { "epoch": 0.6898249614305982, "grad_norm": 16.31940460205078, "learning_rate": 1.7249036220969587e-05, "loss": 2.9996, "step": 286500 }, { "epoch": 0.6910288444348401, "grad_norm": 18.389102935791016, "learning_rate": 1.718215356792201e-05, "loss": 3.0546, "step": 287000 }, { "epoch": 0.692232727439082, "grad_norm": 13.655202865600586, "learning_rate": 1.711540468018053e-05, "loss": 3.0324, "step": 287500 }, { "epoch": 0.6934366104433239, "grad_norm": 16.57909393310547, "learning_rate": 1.7048522027132954e-05, "loss": 3.0293, "step": 288000 }, { "epoch": 0.6946404934475657, "grad_norm": 20.497554779052734, "learning_rate": 1.6981639374085383e-05, "loss": 3.0236, "step": 288500 }, { "epoch": 0.6958443764518076, "grad_norm": 18.09133529663086, "learning_rate": 1.6914756721037805e-05, "loss": 3.0379, "step": 289000 }, { "epoch": 0.6970482594560495, "grad_norm": 26.225669860839844, "learning_rate": 1.684787406799023e-05, "loss": 3.0053, "step": 289500 }, { "epoch": 0.6982521424602914, "grad_norm": 17.222896575927734, "learning_rate": 1.6780991414942657e-05, "loss": 2.9939, "step": 290000 }, { "epoch": 0.6982521424602914, "eval_runtime": 6305.6016, "eval_samples_per_second": 131.731, "eval_steps_per_second": 32.933, "step": 290000 }, { "epoch": 0.6994560254645333, "grad_norm": 13.189409255981445, "learning_rate": 1.6714108761895082e-05, "loss": 3.0342, "step": 290500 }, { "epoch": 0.7006599084687752, "grad_norm": 16.97842025756836, "learning_rate": 1.6647226108847505e-05, "loss": 3.046, "step": 291000 }, { "epoch": 0.701863791473017, "grad_norm": 22.634611129760742, "learning_rate": 1.658061098641212e-05, "loss": 3.0375, "step": 291500 }, { "epoch": 0.7030676744772589, "grad_norm": 18.193796157836914, "learning_rate": 1.6513728333364545e-05, "loss": 3.0379, "step": 292000 }, { "epoch": 0.7042715574815008, "grad_norm": 18.391408920288086, "learning_rate": 1.644684568031697e-05, "loss": 2.9838, "step": 292500 }, { "epoch": 0.7054754404857427, "grad_norm": 20.497100830078125, "learning_rate": 1.6380096792575493e-05, "loss": 3.0761, "step": 293000 }, { "epoch": 0.7066793234899846, "grad_norm": 18.94228744506836, "learning_rate": 1.6313214139527915e-05, "loss": 3.0614, "step": 293500 }, { "epoch": 0.7078832064942264, "grad_norm": 15.402490615844727, "learning_rate": 1.624633148648034e-05, "loss": 3.0053, "step": 294000 }, { "epoch": 0.7090870894984683, "grad_norm": 26.502038955688477, "learning_rate": 1.6179448833432767e-05, "loss": 3.0216, "step": 294500 }, { "epoch": 0.7102909725027102, "grad_norm": 20.452205657958984, "learning_rate": 1.6112566180385193e-05, "loss": 2.9757, "step": 295000 }, { "epoch": 0.7102909725027102, "eval_runtime": 6348.6102, "eval_samples_per_second": 130.839, "eval_steps_per_second": 32.71, "step": 295000 }, { "epoch": 0.7114948555069521, "grad_norm": 104.5809097290039, "learning_rate": 1.6045683527337615e-05, "loss": 3.0088, "step": 295500 }, { "epoch": 0.712698738511194, "grad_norm": 15.921069145202637, "learning_rate": 1.597880087429004e-05, "loss": 3.0624, "step": 296000 }, { "epoch": 0.7139026215154359, "grad_norm": 11.739727020263672, "learning_rate": 1.5911918221242467e-05, "loss": 3.0515, "step": 296500 }, { "epoch": 0.7151065045196777, "grad_norm": 15.340862274169922, "learning_rate": 1.5845169333500985e-05, "loss": 3.0208, "step": 297000 }, { "epoch": 0.7163103875239196, "grad_norm": 16.77552604675293, "learning_rate": 1.577828668045341e-05, "loss": 3.0112, "step": 297500 }, { "epoch": 0.7175142705281615, "grad_norm": 19.09606170654297, "learning_rate": 1.5711404027405837e-05, "loss": 3.0038, "step": 298000 }, { "epoch": 0.7187181535324034, "grad_norm": 12.892488479614258, "learning_rate": 1.5644521374358262e-05, "loss": 3.0353, "step": 298500 }, { "epoch": 0.7199220365366453, "grad_norm": 15.720181465148926, "learning_rate": 1.5577638721310685e-05, "loss": 3.003, "step": 299000 }, { "epoch": 0.7211259195408872, "grad_norm": 16.5432186126709, "learning_rate": 1.5510756068263114e-05, "loss": 3.0594, "step": 299500 }, { "epoch": 0.722329802545129, "grad_norm": 24.2777042388916, "learning_rate": 1.5443873415215536e-05, "loss": 3.0239, "step": 300000 }, { "epoch": 0.722329802545129, "eval_runtime": 6203.3821, "eval_samples_per_second": 133.902, "eval_steps_per_second": 33.476, "step": 300000 }, { "epoch": 0.7235336855493709, "grad_norm": 14.297070503234863, "learning_rate": 1.5376990762167962e-05, "loss": 3.0123, "step": 300500 }, { "epoch": 0.7247375685536128, "grad_norm": 18.216154098510742, "learning_rate": 1.5310108109120384e-05, "loss": 2.9833, "step": 301000 }, { "epoch": 0.7259414515578547, "grad_norm": 15.619494438171387, "learning_rate": 1.5243359221378908e-05, "loss": 3.0715, "step": 301500 }, { "epoch": 0.7271453345620966, "grad_norm": 22.748498916625977, "learning_rate": 1.5176476568331332e-05, "loss": 3.0101, "step": 302000 }, { "epoch": 0.7283492175663384, "grad_norm": 16.824371337890625, "learning_rate": 1.5109593915283756e-05, "loss": 3.0347, "step": 302500 }, { "epoch": 0.7295531005705803, "grad_norm": 15.611109733581543, "learning_rate": 1.504271126223618e-05, "loss": 3.0386, "step": 303000 }, { "epoch": 0.7307569835748222, "grad_norm": 17.015262603759766, "learning_rate": 1.4975962374494704e-05, "loss": 3.0148, "step": 303500 }, { "epoch": 0.7319608665790641, "grad_norm": 18.96904945373535, "learning_rate": 1.4909079721447128e-05, "loss": 3.1005, "step": 304000 }, { "epoch": 0.733164749583306, "grad_norm": 21.718101501464844, "learning_rate": 1.4842197068399552e-05, "loss": 3.0489, "step": 304500 }, { "epoch": 0.7343686325875479, "grad_norm": 14.246601104736328, "learning_rate": 1.4775314415351976e-05, "loss": 3.0439, "step": 305000 }, { "epoch": 0.7343686325875479, "eval_runtime": 6028.9355, "eval_samples_per_second": 137.776, "eval_steps_per_second": 34.444, "step": 305000 }, { "epoch": 0.7355725155917897, "grad_norm": 16.374101638793945, "learning_rate": 1.4708431762304404e-05, "loss": 2.9875, "step": 305500 }, { "epoch": 0.7367763985960316, "grad_norm": 21.80797004699707, "learning_rate": 1.4641549109256828e-05, "loss": 3.0288, "step": 306000 }, { "epoch": 0.7379802816002735, "grad_norm": 14.981256484985352, "learning_rate": 1.4574666456209252e-05, "loss": 3.0079, "step": 306500 }, { "epoch": 0.7391841646045154, "grad_norm": 15.336825370788574, "learning_rate": 1.4507783803161679e-05, "loss": 3.0317, "step": 307000 }, { "epoch": 0.7403880476087573, "grad_norm": 16.014474868774414, "learning_rate": 1.4440901150114103e-05, "loss": 3.0247, "step": 307500 }, { "epoch": 0.7415919306129992, "grad_norm": 14.997090339660645, "learning_rate": 1.4374152262372622e-05, "loss": 3.0177, "step": 308000 }, { "epoch": 0.742795813617241, "grad_norm": 17.185972213745117, "learning_rate": 1.4307269609325048e-05, "loss": 3.021, "step": 308500 }, { "epoch": 0.7439996966214829, "grad_norm": 14.902591705322266, "learning_rate": 1.4240386956277473e-05, "loss": 3.0291, "step": 309000 }, { "epoch": 0.7452035796257248, "grad_norm": 17.680278778076172, "learning_rate": 1.4173504303229897e-05, "loss": 3.0205, "step": 309500 }, { "epoch": 0.7464074626299667, "grad_norm": 18.492225646972656, "learning_rate": 1.4106621650182321e-05, "loss": 3.012, "step": 310000 }, { "epoch": 0.7464074626299667, "eval_runtime": 6370.9929, "eval_samples_per_second": 130.379, "eval_steps_per_second": 32.595, "step": 310000 }, { "epoch": 0.7476113456342086, "grad_norm": 18.544729232788086, "learning_rate": 1.4039872762440842e-05, "loss": 2.9965, "step": 310500 }, { "epoch": 0.7488152286384504, "grad_norm": 19.649858474731445, "learning_rate": 1.397299010939327e-05, "loss": 3.0335, "step": 311000 }, { "epoch": 0.7500191116426923, "grad_norm": 19.35677146911621, "learning_rate": 1.3906107456345693e-05, "loss": 3.0426, "step": 311500 }, { "epoch": 0.7512229946469342, "grad_norm": 19.635725021362305, "learning_rate": 1.3839224803298117e-05, "loss": 3.0506, "step": 312000 }, { "epoch": 0.7524268776511761, "grad_norm": 16.11264991760254, "learning_rate": 1.3772342150250541e-05, "loss": 3.0185, "step": 312500 }, { "epoch": 0.753630760655418, "grad_norm": 16.436038970947266, "learning_rate": 1.3705593262509065e-05, "loss": 2.9902, "step": 313000 }, { "epoch": 0.7548346436596599, "grad_norm": 15.412540435791016, "learning_rate": 1.363871060946149e-05, "loss": 2.987, "step": 313500 }, { "epoch": 0.7560385266639017, "grad_norm": 15.1536283493042, "learning_rate": 1.3571827956413913e-05, "loss": 2.9802, "step": 314000 }, { "epoch": 0.7572424096681436, "grad_norm": 12.424234390258789, "learning_rate": 1.3504945303366337e-05, "loss": 3.0389, "step": 314500 }, { "epoch": 0.7584462926723855, "grad_norm": 18.4250431060791, "learning_rate": 1.3438062650318765e-05, "loss": 3.0125, "step": 315000 }, { "epoch": 0.7584462926723855, "eval_runtime": 6375.7867, "eval_samples_per_second": 130.281, "eval_steps_per_second": 32.57, "step": 315000 }, { "epoch": 0.7596501756766274, "grad_norm": 16.10649299621582, "learning_rate": 1.3371313762577283e-05, "loss": 2.9806, "step": 315500 }, { "epoch": 0.7608540586808693, "grad_norm": 20.46068572998047, "learning_rate": 1.3304431109529707e-05, "loss": 3.0044, "step": 316000 }, { "epoch": 0.7620579416851112, "grad_norm": 13.980119705200195, "learning_rate": 1.3237548456482131e-05, "loss": 3.0349, "step": 316500 }, { "epoch": 0.763261824689353, "grad_norm": 14.805524826049805, "learning_rate": 1.3170665803434559e-05, "loss": 3.0352, "step": 317000 }, { "epoch": 0.7644657076935949, "grad_norm": 17.586395263671875, "learning_rate": 1.3103783150386983e-05, "loss": 3.0501, "step": 317500 }, { "epoch": 0.7656695906978368, "grad_norm": 17.75722312927246, "learning_rate": 1.3036900497339407e-05, "loss": 3.0598, "step": 318000 }, { "epoch": 0.7668734737020787, "grad_norm": 22.714632034301758, "learning_rate": 1.2970017844291834e-05, "loss": 3.0555, "step": 318500 }, { "epoch": 0.7680773567063206, "grad_norm": 13.692117691040039, "learning_rate": 1.2903268956550355e-05, "loss": 3.0004, "step": 319000 }, { "epoch": 0.7692812397105625, "grad_norm": 15.780096054077148, "learning_rate": 1.2836386303502779e-05, "loss": 3.0278, "step": 319500 }, { "epoch": 0.7704851227148043, "grad_norm": 22.532176971435547, "learning_rate": 1.2769503650455203e-05, "loss": 3.045, "step": 320000 }, { "epoch": 0.7704851227148043, "eval_runtime": 6361.4309, "eval_samples_per_second": 130.575, "eval_steps_per_second": 32.644, "step": 320000 }, { "epoch": 0.7716890057190462, "grad_norm": 16.199644088745117, "learning_rate": 1.270262099740763e-05, "loss": 3.03, "step": 320500 }, { "epoch": 0.7728928887232881, "grad_norm": 23.411863327026367, "learning_rate": 1.2635738344360054e-05, "loss": 3.0227, "step": 321000 }, { "epoch": 0.77409677172753, "grad_norm": 14.578089714050293, "learning_rate": 1.2568989456618575e-05, "loss": 3.0099, "step": 321500 }, { "epoch": 0.7753006547317719, "grad_norm": 22.472322463989258, "learning_rate": 1.2502106803570999e-05, "loss": 3.0347, "step": 322000 }, { "epoch": 0.7765045377360137, "grad_norm": 12.440498352050781, "learning_rate": 1.2435224150523425e-05, "loss": 2.9987, "step": 322500 }, { "epoch": 0.7777084207402556, "grad_norm": 20.633949279785156, "learning_rate": 1.2368341497475849e-05, "loss": 3.0421, "step": 323000 }, { "epoch": 0.7789123037444975, "grad_norm": 17.52497673034668, "learning_rate": 1.2301458844428274e-05, "loss": 3.0747, "step": 323500 }, { "epoch": 0.7801161867487394, "grad_norm": 19.617210388183594, "learning_rate": 1.2234576191380698e-05, "loss": 2.9955, "step": 324000 }, { "epoch": 0.7813200697529813, "grad_norm": 16.269994735717773, "learning_rate": 1.2167827303639219e-05, "loss": 2.94, "step": 324500 }, { "epoch": 0.7825239527572232, "grad_norm": 13.604962348937988, "learning_rate": 1.2100944650591644e-05, "loss": 3.0736, "step": 325000 }, { "epoch": 0.7825239527572232, "eval_runtime": 6376.4381, "eval_samples_per_second": 130.268, "eval_steps_per_second": 32.567, "step": 325000 }, { "epoch": 0.783727835761465, "grad_norm": 20.704360961914062, "learning_rate": 1.203406199754407e-05, "loss": 3.0536, "step": 325500 }, { "epoch": 0.7849317187657069, "grad_norm": 14.824162483215332, "learning_rate": 1.1967179344496494e-05, "loss": 3.0263, "step": 326000 }, { "epoch": 0.7861356017699488, "grad_norm": 16.627286911010742, "learning_rate": 1.190029669144892e-05, "loss": 3.0037, "step": 326500 }, { "epoch": 0.7873394847741907, "grad_norm": 13.925793647766113, "learning_rate": 1.183354780370744e-05, "loss": 3.0127, "step": 327000 }, { "epoch": 0.7885433677784326, "grad_norm": 19.544754028320312, "learning_rate": 1.1766665150659866e-05, "loss": 3.0307, "step": 327500 }, { "epoch": 0.7897472507826745, "grad_norm": 13.963886260986328, "learning_rate": 1.169978249761229e-05, "loss": 3.034, "step": 328000 }, { "epoch": 0.7909511337869163, "grad_norm": 17.435409545898438, "learning_rate": 1.1632899844564716e-05, "loss": 3.0295, "step": 328500 }, { "epoch": 0.7921550167911582, "grad_norm": 17.950336456298828, "learning_rate": 1.156601719151714e-05, "loss": 3.0332, "step": 329000 }, { "epoch": 0.7933588997954001, "grad_norm": 18.523168563842773, "learning_rate": 1.1499134538469566e-05, "loss": 3.0235, "step": 329500 }, { "epoch": 0.794562782799642, "grad_norm": 14.469148635864258, "learning_rate": 1.143225188542199e-05, "loss": 3.0022, "step": 330000 }, { "epoch": 0.794562782799642, "eval_runtime": 6383.675, "eval_samples_per_second": 130.12, "eval_steps_per_second": 32.53, "step": 330000 }, { "epoch": 0.7957666658038839, "grad_norm": 17.111066818237305, "learning_rate": 1.136550299768051e-05, "loss": 3.0552, "step": 330500 }, { "epoch": 0.7969705488081257, "grad_norm": 15.104440689086914, "learning_rate": 1.1298620344632934e-05, "loss": 3.0274, "step": 331000 }, { "epoch": 0.7981744318123676, "grad_norm": 16.809152603149414, "learning_rate": 1.123173769158536e-05, "loss": 3.0156, "step": 331500 }, { "epoch": 0.7993783148166095, "grad_norm": 16.31627655029297, "learning_rate": 1.1164855038537784e-05, "loss": 3.0302, "step": 332000 }, { "epoch": 0.8005821978208514, "grad_norm": 14.074172019958496, "learning_rate": 1.109797238549021e-05, "loss": 3.0415, "step": 332500 }, { "epoch": 0.8017860808250933, "grad_norm": 26.245460510253906, "learning_rate": 1.1031089732442635e-05, "loss": 3.0031, "step": 333000 }, { "epoch": 0.8029899638293352, "grad_norm": 30.44843864440918, "learning_rate": 1.0964340844701156e-05, "loss": 3.0017, "step": 333500 }, { "epoch": 0.804193846833577, "grad_norm": 17.4643611907959, "learning_rate": 1.0897458191653582e-05, "loss": 3.0633, "step": 334000 }, { "epoch": 0.8053977298378189, "grad_norm": 31.82565689086914, "learning_rate": 1.0830575538606006e-05, "loss": 3.043, "step": 334500 }, { "epoch": 0.8066016128420608, "grad_norm": 17.253402709960938, "learning_rate": 1.0763692885558431e-05, "loss": 3.0325, "step": 335000 }, { "epoch": 0.8066016128420608, "eval_runtime": 6315.4758, "eval_samples_per_second": 131.525, "eval_steps_per_second": 32.881, "step": 335000 }, { "epoch": 0.8078054958463027, "grad_norm": 22.236631393432617, "learning_rate": 1.0696810232510855e-05, "loss": 3.0358, "step": 335500 }, { "epoch": 0.8090093788505446, "grad_norm": 14.467453956604004, "learning_rate": 1.0629927579463281e-05, "loss": 2.9967, "step": 336000 }, { "epoch": 0.8102132618547865, "grad_norm": 23.571836471557617, "learning_rate": 1.0563044926415705e-05, "loss": 3.0579, "step": 336500 }, { "epoch": 0.8114171448590283, "grad_norm": 19.492727279663086, "learning_rate": 1.0496162273368131e-05, "loss": 3.0471, "step": 337000 }, { "epoch": 0.8126210278632702, "grad_norm": 14.599898338317871, "learning_rate": 1.0429413385626651e-05, "loss": 3.0066, "step": 337500 }, { "epoch": 0.8138249108675121, "grad_norm": 17.604732513427734, "learning_rate": 1.0362530732579075e-05, "loss": 3.0106, "step": 338000 }, { "epoch": 0.815028793871754, "grad_norm": 15.079025268554688, "learning_rate": 1.0295781844837596e-05, "loss": 3.006, "step": 338500 }, { "epoch": 0.8162326768759959, "grad_norm": 17.019149780273438, "learning_rate": 1.0228899191790021e-05, "loss": 3.0254, "step": 339000 }, { "epoch": 0.8174365598802377, "grad_norm": 15.817625045776367, "learning_rate": 1.0162016538742445e-05, "loss": 3.002, "step": 339500 }, { "epoch": 0.8186404428844796, "grad_norm": 13.755847930908203, "learning_rate": 1.0095133885694871e-05, "loss": 3.0058, "step": 340000 }, { "epoch": 0.8186404428844796, "eval_runtime": 6241.8468, "eval_samples_per_second": 133.077, "eval_steps_per_second": 33.269, "step": 340000 }, { "epoch": 0.8198443258887215, "grad_norm": 16.21925926208496, "learning_rate": 1.0028251232647295e-05, "loss": 3.0572, "step": 340500 }, { "epoch": 0.8210482088929634, "grad_norm": 17.245609283447266, "learning_rate": 9.961502344905817e-06, "loss": 3.0659, "step": 341000 }, { "epoch": 0.8222520918972053, "grad_norm": 17.12338638305664, "learning_rate": 9.894619691858241e-06, "loss": 3.0002, "step": 341500 }, { "epoch": 0.8234559749014472, "grad_norm": 13.26212215423584, "learning_rate": 9.827737038810667e-06, "loss": 2.9828, "step": 342000 }, { "epoch": 0.824659857905689, "grad_norm": 20.169322967529297, "learning_rate": 9.760854385763091e-06, "loss": 2.9912, "step": 342500 }, { "epoch": 0.8258637409099309, "grad_norm": 18.99537467956543, "learning_rate": 9.693971732715517e-06, "loss": 3.0485, "step": 343000 }, { "epoch": 0.8270676239141728, "grad_norm": 27.021839141845703, "learning_rate": 9.627089079667943e-06, "loss": 3.029, "step": 343500 }, { "epoch": 0.8282715069184147, "grad_norm": 21.197938919067383, "learning_rate": 9.560206426620367e-06, "loss": 3.058, "step": 344000 }, { "epoch": 0.8294753899226566, "grad_norm": 15.80473518371582, "learning_rate": 9.493457538878885e-06, "loss": 3.0378, "step": 344500 }, { "epoch": 0.8306792729268985, "grad_norm": 20.992782592773438, "learning_rate": 9.426574885831311e-06, "loss": 3.042, "step": 345000 }, { "epoch": 0.8306792729268985, "eval_runtime": 6237.488, "eval_samples_per_second": 133.17, "eval_steps_per_second": 33.293, "step": 345000 }, { "epoch": 0.8318831559311403, "grad_norm": 15.700128555297852, "learning_rate": 9.359692232783737e-06, "loss": 3.007, "step": 345500 }, { "epoch": 0.8330870389353822, "grad_norm": 15.391378402709961, "learning_rate": 9.292809579736161e-06, "loss": 3.0211, "step": 346000 }, { "epoch": 0.8342909219396241, "grad_norm": 17.32360076904297, "learning_rate": 9.225926926688587e-06, "loss": 3.0727, "step": 346500 }, { "epoch": 0.835494804943866, "grad_norm": 15.85698127746582, "learning_rate": 9.159178038947107e-06, "loss": 3.0066, "step": 347000 }, { "epoch": 0.8366986879481079, "grad_norm": 15.092347145080566, "learning_rate": 9.092295385899533e-06, "loss": 3.0106, "step": 347500 }, { "epoch": 0.8379025709523497, "grad_norm": 14.47977352142334, "learning_rate": 9.025412732851957e-06, "loss": 3.0139, "step": 348000 }, { "epoch": 0.8391064539565916, "grad_norm": 12.257486343383789, "learning_rate": 8.958530079804383e-06, "loss": 3.0264, "step": 348500 }, { "epoch": 0.8403103369608335, "grad_norm": 17.00981330871582, "learning_rate": 8.891781192062903e-06, "loss": 3.0321, "step": 349000 }, { "epoch": 0.8415142199650754, "grad_norm": 17.08600616455078, "learning_rate": 8.824898539015327e-06, "loss": 3.0046, "step": 349500 }, { "epoch": 0.8427181029693173, "grad_norm": 14.907938003540039, "learning_rate": 8.758015885967753e-06, "loss": 3.0485, "step": 350000 }, { "epoch": 0.8427181029693173, "eval_runtime": 6388.9903, "eval_samples_per_second": 130.012, "eval_steps_per_second": 32.503, "step": 350000 }, { "epoch": 0.8439219859735592, "grad_norm": 14.369677543640137, "learning_rate": 8.691133232920177e-06, "loss": 3.0205, "step": 350500 }, { "epoch": 0.845125868977801, "grad_norm": 19.901779174804688, "learning_rate": 8.624250579872602e-06, "loss": 3.0481, "step": 351000 }, { "epoch": 0.8463297519820429, "grad_norm": 14.823498725891113, "learning_rate": 8.557367926825027e-06, "loss": 2.9577, "step": 351500 }, { "epoch": 0.8475336349862848, "grad_norm": 19.70775032043457, "learning_rate": 8.490485273777452e-06, "loss": 3.0341, "step": 352000 }, { "epoch": 0.8487375179905267, "grad_norm": 17.01579475402832, "learning_rate": 8.423736386035973e-06, "loss": 2.9874, "step": 352500 }, { "epoch": 0.8499414009947686, "grad_norm": 16.942848205566406, "learning_rate": 8.356853732988397e-06, "loss": 3.0226, "step": 353000 }, { "epoch": 0.8511452839990105, "grad_norm": 16.905664443969727, "learning_rate": 8.289971079940822e-06, "loss": 2.9484, "step": 353500 }, { "epoch": 0.8523491670032523, "grad_norm": 15.149470329284668, "learning_rate": 8.223088426893248e-06, "loss": 2.9945, "step": 354000 }, { "epoch": 0.8535530500074942, "grad_norm": 21.70083236694336, "learning_rate": 8.156205773845672e-06, "loss": 3.0103, "step": 354500 }, { "epoch": 0.854756933011736, "grad_norm": 12.760059356689453, "learning_rate": 8.089323120798098e-06, "loss": 3.0178, "step": 355000 }, { "epoch": 0.854756933011736, "eval_runtime": 6293.9927, "eval_samples_per_second": 131.974, "eval_steps_per_second": 32.994, "step": 355000 }, { "epoch": 0.855960816015978, "grad_norm": 28.85261344909668, "learning_rate": 8.022440467750522e-06, "loss": 2.9688, "step": 355500 }, { "epoch": 0.8571646990202199, "grad_norm": 13.942831039428711, "learning_rate": 7.955557814702948e-06, "loss": 2.9447, "step": 356000 }, { "epoch": 0.8583685820244616, "grad_norm": 14.091262817382812, "learning_rate": 7.888808926961468e-06, "loss": 2.9865, "step": 356500 }, { "epoch": 0.8595724650287035, "grad_norm": 19.63146209716797, "learning_rate": 7.821926273913894e-06, "loss": 3.0031, "step": 357000 }, { "epoch": 0.8607763480329454, "grad_norm": 12.868454933166504, "learning_rate": 7.755043620866318e-06, "loss": 2.9701, "step": 357500 }, { "epoch": 0.8619802310371873, "grad_norm": 18.4489803314209, "learning_rate": 7.688160967818744e-06, "loss": 2.9628, "step": 358000 }, { "epoch": 0.8631841140414293, "grad_norm": 14.441180229187012, "learning_rate": 7.6212783147711685e-06, "loss": 3.0001, "step": 358500 }, { "epoch": 0.8643879970456712, "grad_norm": 14.59991455078125, "learning_rate": 7.554529427029688e-06, "loss": 3.0118, "step": 359000 }, { "epoch": 0.8655918800499129, "grad_norm": 24.200435638427734, "learning_rate": 7.487646773982113e-06, "loss": 3.0567, "step": 359500 }, { "epoch": 0.8667957630541548, "grad_norm": 17.150327682495117, "learning_rate": 7.420764120934539e-06, "loss": 3.0472, "step": 360000 }, { "epoch": 0.8667957630541548, "eval_runtime": 6262.3633, "eval_samples_per_second": 132.641, "eval_steps_per_second": 33.16, "step": 360000 }, { "epoch": 0.8679996460583967, "grad_norm": 20.363269805908203, "learning_rate": 7.353881467886964e-06, "loss": 2.9548, "step": 360500 }, { "epoch": 0.8692035290626386, "grad_norm": 16.118206024169922, "learning_rate": 7.2869988148393885e-06, "loss": 3.0507, "step": 361000 }, { "epoch": 0.8704074120668805, "grad_norm": 16.389257431030273, "learning_rate": 7.220116161791813e-06, "loss": 3.0169, "step": 361500 }, { "epoch": 0.8716112950711224, "grad_norm": 15.485569953918457, "learning_rate": 7.153233508744238e-06, "loss": 3.016, "step": 362000 }, { "epoch": 0.8728151780753642, "grad_norm": 18.530200958251953, "learning_rate": 7.086350855696663e-06, "loss": 3.0083, "step": 362500 }, { "epoch": 0.8740190610796061, "grad_norm": 14.700156211853027, "learning_rate": 7.0196019679551835e-06, "loss": 2.9861, "step": 363000 }, { "epoch": 0.875222944083848, "grad_norm": 19.87506675720215, "learning_rate": 6.952719314907609e-06, "loss": 3.0287, "step": 363500 }, { "epoch": 0.8764268270880899, "grad_norm": 25.59213638305664, "learning_rate": 6.885836661860033e-06, "loss": 3.0149, "step": 364000 }, { "epoch": 0.8776307100923318, "grad_norm": 16.81450653076172, "learning_rate": 6.818954008812459e-06, "loss": 3.0167, "step": 364500 }, { "epoch": 0.8788345930965736, "grad_norm": 20.761167526245117, "learning_rate": 6.7522051210709786e-06, "loss": 3.037, "step": 365000 }, { "epoch": 0.8788345930965736, "eval_runtime": 6325.2523, "eval_samples_per_second": 131.322, "eval_steps_per_second": 32.831, "step": 365000 }, { "epoch": 0.8800384761008155, "grad_norm": 18.997737884521484, "learning_rate": 6.685322468023404e-06, "loss": 3.0299, "step": 365500 }, { "epoch": 0.8812423591050574, "grad_norm": 18.71440315246582, "learning_rate": 6.618439814975828e-06, "loss": 3.018, "step": 366000 }, { "epoch": 0.8824462421092993, "grad_norm": 17.6945858001709, "learning_rate": 6.551557161928254e-06, "loss": 3.0215, "step": 366500 }, { "epoch": 0.8836501251135412, "grad_norm": 17.693279266357422, "learning_rate": 6.48467450888068e-06, "loss": 3.0327, "step": 367000 }, { "epoch": 0.8848540081177831, "grad_norm": 12.849013328552246, "learning_rate": 6.417791855833104e-06, "loss": 3.0219, "step": 367500 }, { "epoch": 0.8860578911220249, "grad_norm": 15.688241958618164, "learning_rate": 6.351042968091623e-06, "loss": 3.0481, "step": 368000 }, { "epoch": 0.8872617741262668, "grad_norm": 16.61380958557129, "learning_rate": 6.284160315044049e-06, "loss": 2.9957, "step": 368500 }, { "epoch": 0.8884656571305087, "grad_norm": 14.891318321228027, "learning_rate": 6.217277661996474e-06, "loss": 3.0474, "step": 369000 }, { "epoch": 0.8896695401347506, "grad_norm": 20.029443740844727, "learning_rate": 6.150395008948899e-06, "loss": 3.0241, "step": 369500 }, { "epoch": 0.8908734231389925, "grad_norm": 13.43873119354248, "learning_rate": 6.083512355901325e-06, "loss": 3.0318, "step": 370000 }, { "epoch": 0.8908734231389925, "eval_runtime": 6403.6333, "eval_samples_per_second": 129.715, "eval_steps_per_second": 32.429, "step": 370000 }, { "epoch": 0.8920773061432344, "grad_norm": 16.173236846923828, "learning_rate": 6.016763468159844e-06, "loss": 3.0562, "step": 370500 }, { "epoch": 0.8932811891474762, "grad_norm": 21.55840301513672, "learning_rate": 5.949880815112269e-06, "loss": 2.9967, "step": 371000 }, { "epoch": 0.8944850721517181, "grad_norm": 15.276843070983887, "learning_rate": 5.882998162064694e-06, "loss": 3.0263, "step": 371500 }, { "epoch": 0.89568895515596, "grad_norm": 15.087631225585938, "learning_rate": 5.81611550901712e-06, "loss": 2.9793, "step": 372000 }, { "epoch": 0.8968928381602019, "grad_norm": 12.954302787780762, "learning_rate": 5.749232855969545e-06, "loss": 3.0192, "step": 372500 }, { "epoch": 0.8980967211644438, "grad_norm": 20.37034797668457, "learning_rate": 5.6823502029219695e-06, "loss": 3.0274, "step": 373000 }, { "epoch": 0.8993006041686856, "grad_norm": 16.947673797607422, "learning_rate": 5.61560131518049e-06, "loss": 3.0792, "step": 373500 }, { "epoch": 0.9005044871729275, "grad_norm": 14.517135620117188, "learning_rate": 5.548718662132915e-06, "loss": 2.9878, "step": 374000 }, { "epoch": 0.9017083701771694, "grad_norm": 15.187361717224121, "learning_rate": 5.48183600908534e-06, "loss": 3.0541, "step": 374500 }, { "epoch": 0.9029122531814113, "grad_norm": 15.383942604064941, "learning_rate": 5.4149533560377646e-06, "loss": 3.021, "step": 375000 }, { "epoch": 0.9029122531814113, "eval_runtime": 6390.605, "eval_samples_per_second": 129.979, "eval_steps_per_second": 32.495, "step": 375000 }, { "epoch": 0.9041161361856532, "grad_norm": 17.510334014892578, "learning_rate": 5.3480707029901895e-06, "loss": 3.0943, "step": 375500 }, { "epoch": 0.9053200191898951, "grad_norm": 16.601346969604492, "learning_rate": 5.281321815248711e-06, "loss": 3.0723, "step": 376000 }, { "epoch": 0.9065239021941369, "grad_norm": 22.802818298339844, "learning_rate": 5.2144391622011356e-06, "loss": 3.0491, "step": 376500 }, { "epoch": 0.9077277851983788, "grad_norm": 17.018939971923828, "learning_rate": 5.1475565091535605e-06, "loss": 3.0556, "step": 377000 }, { "epoch": 0.9089316682026207, "grad_norm": 19.08505630493164, "learning_rate": 5.080673856105985e-06, "loss": 3.046, "step": 377500 }, { "epoch": 0.9101355512068626, "grad_norm": 16.25370216369629, "learning_rate": 5.01379120305841e-06, "loss": 3.0191, "step": 378000 }, { "epoch": 0.9113394342111045, "grad_norm": 16.954275131225586, "learning_rate": 4.946908550010835e-06, "loss": 3.0025, "step": 378500 }, { "epoch": 0.9125433172153464, "grad_norm": 26.870176315307617, "learning_rate": 4.8801596622693555e-06, "loss": 3.0288, "step": 379000 }, { "epoch": 0.9137472002195882, "grad_norm": 14.162908554077148, "learning_rate": 4.81327700922178e-06, "loss": 3.0278, "step": 379500 }, { "epoch": 0.9149510832238301, "grad_norm": 16.129444122314453, "learning_rate": 4.746394356174205e-06, "loss": 3.0409, "step": 380000 }, { "epoch": 0.9149510832238301, "eval_runtime": 6329.5669, "eval_samples_per_second": 131.233, "eval_steps_per_second": 32.808, "step": 380000 }, { "epoch": 0.916154966228072, "grad_norm": 19.689468383789062, "learning_rate": 4.67951170312663e-06, "loss": 3.0399, "step": 380500 }, { "epoch": 0.9173588492323139, "grad_norm": 17.123493194580078, "learning_rate": 4.6127628153851506e-06, "loss": 3.0142, "step": 381000 }, { "epoch": 0.9185627322365558, "grad_norm": 15.44541072845459, "learning_rate": 4.5458801623375755e-06, "loss": 2.9937, "step": 381500 }, { "epoch": 0.9197666152407976, "grad_norm": 20.037689208984375, "learning_rate": 4.47899750929e-06, "loss": 3.0889, "step": 382000 }, { "epoch": 0.9209704982450395, "grad_norm": 17.4291934967041, "learning_rate": 4.412114856242425e-06, "loss": 2.9653, "step": 382500 }, { "epoch": 0.9221743812492814, "grad_norm": 18.911190032958984, "learning_rate": 4.345232203194851e-06, "loss": 3.0299, "step": 383000 }, { "epoch": 0.9233782642535233, "grad_norm": 18.403993606567383, "learning_rate": 4.278349550147276e-06, "loss": 3.0437, "step": 383500 }, { "epoch": 0.9245821472577652, "grad_norm": 17.68988800048828, "learning_rate": 4.211600662405795e-06, "loss": 2.9655, "step": 384000 }, { "epoch": 0.9257860302620071, "grad_norm": 15.752707481384277, "learning_rate": 4.144718009358221e-06, "loss": 3.0118, "step": 384500 }, { "epoch": 0.9269899132662489, "grad_norm": 15.633676528930664, "learning_rate": 4.077835356310646e-06, "loss": 2.98, "step": 385000 }, { "epoch": 0.9269899132662489, "eval_runtime": 6416.4378, "eval_samples_per_second": 129.456, "eval_steps_per_second": 32.364, "step": 385000 }, { "epoch": 0.9281937962704908, "grad_norm": 22.764881134033203, "learning_rate": 4.010952703263071e-06, "loss": 3.0464, "step": 385500 }, { "epoch": 0.9293976792747327, "grad_norm": 16.236614227294922, "learning_rate": 3.944070050215496e-06, "loss": 3.0362, "step": 386000 }, { "epoch": 0.9306015622789746, "grad_norm": 14.50631332397461, "learning_rate": 3.877321162474017e-06, "loss": 3.071, "step": 386500 }, { "epoch": 0.9318054452832165, "grad_norm": 13.831846237182617, "learning_rate": 3.8104385094264415e-06, "loss": 3.0001, "step": 387000 }, { "epoch": 0.9330093282874584, "grad_norm": 12.26697826385498, "learning_rate": 3.7435558563788664e-06, "loss": 3.0437, "step": 387500 }, { "epoch": 0.9342132112917002, "grad_norm": 20.174835205078125, "learning_rate": 3.6766732033312913e-06, "loss": 3.0136, "step": 388000 }, { "epoch": 0.9354170942959421, "grad_norm": 19.26807975769043, "learning_rate": 3.609790550283716e-06, "loss": 3.0054, "step": 388500 }, { "epoch": 0.936620977300184, "grad_norm": 13.987044334411621, "learning_rate": 3.542907897236141e-06, "loss": 3.004, "step": 389000 }, { "epoch": 0.9378248603044259, "grad_norm": 19.408586502075195, "learning_rate": 3.476025244188567e-06, "loss": 3.0438, "step": 389500 }, { "epoch": 0.9390287433086678, "grad_norm": 20.116239547729492, "learning_rate": 3.4092763564470868e-06, "loss": 3.0043, "step": 390000 }, { "epoch": 0.9390287433086678, "eval_runtime": 6418.8738, "eval_samples_per_second": 129.407, "eval_steps_per_second": 32.352, "step": 390000 }, { "epoch": 0.9402326263129097, "grad_norm": 15.818509101867676, "learning_rate": 3.3423937033995117e-06, "loss": 2.9467, "step": 390500 }, { "epoch": 0.9414365093171515, "grad_norm": 17.208309173583984, "learning_rate": 3.2755110503519366e-06, "loss": 3.0507, "step": 391000 }, { "epoch": 0.9426403923213934, "grad_norm": 14.738162994384766, "learning_rate": 3.208628397304362e-06, "loss": 3.0281, "step": 391500 }, { "epoch": 0.9438442753256353, "grad_norm": 15.624344825744629, "learning_rate": 3.141879509562882e-06, "loss": 3.0248, "step": 392000 }, { "epoch": 0.9450481583298772, "grad_norm": 17.159011840820312, "learning_rate": 3.074996856515307e-06, "loss": 2.9597, "step": 392500 }, { "epoch": 0.9462520413341191, "grad_norm": 13.915901184082031, "learning_rate": 3.008114203467732e-06, "loss": 2.9937, "step": 393000 }, { "epoch": 0.9474559243383609, "grad_norm": 20.13627052307129, "learning_rate": 2.941231550420157e-06, "loss": 2.9966, "step": 393500 }, { "epoch": 0.9486598073426028, "grad_norm": 26.449026107788086, "learning_rate": 2.8743488973725822e-06, "loss": 2.9904, "step": 394000 }, { "epoch": 0.9498636903468447, "grad_norm": 18.189252853393555, "learning_rate": 2.807600009631102e-06, "loss": 3.0078, "step": 394500 }, { "epoch": 0.9510675733510866, "grad_norm": 20.91954803466797, "learning_rate": 2.740717356583527e-06, "loss": 3.0439, "step": 395000 }, { "epoch": 0.9510675733510866, "eval_runtime": 6350.5821, "eval_samples_per_second": 130.798, "eval_steps_per_second": 32.7, "step": 395000 }, { "epoch": 0.9522714563553285, "grad_norm": 18.318206787109375, "learning_rate": 2.673834703535952e-06, "loss": 2.989, "step": 395500 }, { "epoch": 0.9534753393595704, "grad_norm": 16.19314193725586, "learning_rate": 2.6069520504883773e-06, "loss": 2.9842, "step": 396000 }, { "epoch": 0.9546792223638122, "grad_norm": 16.36551856994629, "learning_rate": 2.540069397440802e-06, "loss": 2.9938, "step": 396500 }, { "epoch": 0.9558831053680541, "grad_norm": 19.816038131713867, "learning_rate": 2.473186744393227e-06, "loss": 3.0204, "step": 397000 }, { "epoch": 0.957086988372296, "grad_norm": 14.318347930908203, "learning_rate": 2.4064378566517474e-06, "loss": 3.0851, "step": 397500 }, { "epoch": 0.9582908713765379, "grad_norm": 17.114421844482422, "learning_rate": 2.3395552036041728e-06, "loss": 3.0096, "step": 398000 }, { "epoch": 0.9594947543807798, "grad_norm": 18.27849578857422, "learning_rate": 2.2726725505565977e-06, "loss": 3.0374, "step": 398500 }, { "epoch": 0.9606986373850217, "grad_norm": 16.87068748474121, "learning_rate": 2.2057898975090225e-06, "loss": 3.0484, "step": 399000 }, { "epoch": 0.9619025203892635, "grad_norm": 22.162954330444336, "learning_rate": 2.138907244461448e-06, "loss": 3.04, "step": 399500 }, { "epoch": 0.9631064033935054, "grad_norm": 16.329286575317383, "learning_rate": 2.0720245914138728e-06, "loss": 2.9491, "step": 400000 }, { "epoch": 0.9631064033935054, "eval_runtime": 6424.6879, "eval_samples_per_second": 129.29, "eval_steps_per_second": 32.323, "step": 400000 }, { "epoch": 0.9643102863977473, "grad_norm": 16.189512252807617, "learning_rate": 2.0052757036723927e-06, "loss": 3.045, "step": 400500 }, { "epoch": 0.9655141694019892, "grad_norm": 18.53325080871582, "learning_rate": 1.9383930506248176e-06, "loss": 3.0405, "step": 401000 }, { "epoch": 0.9667180524062311, "grad_norm": 21.920936584472656, "learning_rate": 1.871510397577243e-06, "loss": 3.0347, "step": 401500 }, { "epoch": 0.9679219354104729, "grad_norm": 10.188512802124023, "learning_rate": 1.8046277445296678e-06, "loss": 2.9497, "step": 402000 }, { "epoch": 0.9691258184147148, "grad_norm": 23.691808700561523, "learning_rate": 1.7377450914820931e-06, "loss": 3.0046, "step": 402500 }, { "epoch": 0.9703297014189567, "grad_norm": 17.392013549804688, "learning_rate": 1.6709962037406129e-06, "loss": 2.996, "step": 403000 }, { "epoch": 0.9715335844231986, "grad_norm": 19.87090492248535, "learning_rate": 1.6041135506930382e-06, "loss": 3.042, "step": 403500 }, { "epoch": 0.9727374674274405, "grad_norm": 19.895801544189453, "learning_rate": 1.537230897645463e-06, "loss": 2.978, "step": 404000 }, { "epoch": 0.9739413504316824, "grad_norm": 16.795654296875, "learning_rate": 1.4703482445978882e-06, "loss": 3.0219, "step": 404500 }, { "epoch": 0.9751452334359242, "grad_norm": 13.37932014465332, "learning_rate": 1.4034655915503133e-06, "loss": 3.0323, "step": 405000 }, { "epoch": 0.9751452334359242, "eval_runtime": 6375.6502, "eval_samples_per_second": 130.284, "eval_steps_per_second": 32.571, "step": 405000 }, { "epoch": 0.9763491164401661, "grad_norm": 14.84689712524414, "learning_rate": 1.3367167038088334e-06, "loss": 2.9709, "step": 405500 }, { "epoch": 0.977552999444408, "grad_norm": 14.532979011535645, "learning_rate": 1.2698340507612583e-06, "loss": 3.0614, "step": 406000 }, { "epoch": 0.9787568824486499, "grad_norm": 15.914132118225098, "learning_rate": 1.2029513977136834e-06, "loss": 3.0498, "step": 406500 }, { "epoch": 0.9799607654528918, "grad_norm": 14.478850364685059, "learning_rate": 1.1360687446661085e-06, "loss": 2.9675, "step": 407000 }, { "epoch": 0.9811646484571337, "grad_norm": 22.8538818359375, "learning_rate": 1.0691860916185336e-06, "loss": 3.0232, "step": 407500 }, { "epoch": 0.9823685314613755, "grad_norm": 15.004932403564453, "learning_rate": 1.0023034385709585e-06, "loss": 2.9698, "step": 408000 }, { "epoch": 0.9835724144656174, "grad_norm": 15.036443710327148, "learning_rate": 9.354207855233835e-07, "loss": 2.9961, "step": 408500 }, { "epoch": 0.9847762974698593, "grad_norm": 19.975051879882812, "learning_rate": 8.685381324758087e-07, "loss": 3.0067, "step": 409000 }, { "epoch": 0.9859801804741012, "grad_norm": 17.99605369567871, "learning_rate": 8.017892447343288e-07, "loss": 3.0702, "step": 409500 }, { "epoch": 0.9871840634783431, "grad_norm": 15.935543060302734, "learning_rate": 7.349065916867538e-07, "loss": 3.0132, "step": 410000 }, { "epoch": 0.9871840634783431, "eval_runtime": 6347.5902, "eval_samples_per_second": 130.86, "eval_steps_per_second": 32.715, "step": 410000 }, { "epoch": 0.9883879464825849, "grad_norm": 12.5308256149292, "learning_rate": 6.680239386391788e-07, "loss": 2.977, "step": 410500 }, { "epoch": 0.9895918294868268, "grad_norm": 15.325048446655273, "learning_rate": 6.011412855916039e-07, "loss": 3.0383, "step": 411000 }, { "epoch": 0.9907957124910687, "grad_norm": 16.378740310668945, "learning_rate": 5.342586325440288e-07, "loss": 3.0278, "step": 411500 }, { "epoch": 0.9919995954953106, "grad_norm": 17.669631958007812, "learning_rate": 4.673759794964539e-07, "loss": 2.9929, "step": 412000 }, { "epoch": 0.9932034784995525, "grad_norm": 16.54693603515625, "learning_rate": 4.004933264488789e-07, "loss": 2.9768, "step": 412500 }, { "epoch": 0.9944073615037944, "grad_norm": 16.434072494506836, "learning_rate": 3.337444387073991e-07, "loss": 3.0664, "step": 413000 }, { "epoch": 0.9956112445080362, "grad_norm": 28.83799171447754, "learning_rate": 2.6686178565982417e-07, "loss": 3.0716, "step": 413500 }, { "epoch": 0.9968151275122781, "grad_norm": 15.776455879211426, "learning_rate": 1.9997913261224917e-07, "loss": 3.0396, "step": 414000 }, { "epoch": 0.99801901051652, "grad_norm": 18.937358856201172, "learning_rate": 1.330964795646742e-07, "loss": 3.0165, "step": 414500 }, { "epoch": 0.9992228935207619, "grad_norm": 20.05877685546875, "learning_rate": 6.621382651709922e-08, "loss": 3.0059, "step": 415000 }, { "epoch": 0.9992228935207619, "eval_runtime": 6436.495, "eval_samples_per_second": 129.052, "eval_steps_per_second": 32.263, "step": 415000 } ], "logging_steps": 500, "max_steps": 415322, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1124779666016266e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }