{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 4054, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00493339911198816, "grad_norm": 32.17447280883789, "learning_rate": 2.5e-05, "loss": 4.0607, "step": 10 }, { "epoch": 0.00986679822397632, "grad_norm": 30.161712646484375, "learning_rate": 4.995882062263219e-05, "loss": 3.7481, "step": 20 }, { "epoch": 0.01480019733596448, "grad_norm": 38.9721565246582, "learning_rate": 4.9884697743370127e-05, "loss": 3.9839, "step": 30 }, { "epoch": 0.01973359644795264, "grad_norm": 29.38846778869629, "learning_rate": 4.9802338988634494e-05, "loss": 3.7565, "step": 40 }, { "epoch": 0.0246669955599408, "grad_norm": 32.933860778808594, "learning_rate": 4.9728216109372424e-05, "loss": 3.7562, "step": 50 }, { "epoch": 0.02960039467192896, "grad_norm": 24.409399032592773, "learning_rate": 4.96458573546368e-05, "loss": 3.3008, "step": 60 }, { "epoch": 0.03453379378391712, "grad_norm": 25.322418212890625, "learning_rate": 4.956349859990117e-05, "loss": 3.0558, "step": 70 }, { "epoch": 0.03946719289590528, "grad_norm": 28.60622215270996, "learning_rate": 4.948113984516554e-05, "loss": 3.4003, "step": 80 }, { "epoch": 0.04440059200789344, "grad_norm": 15.609489440917969, "learning_rate": 4.9398781090429915e-05, "loss": 3.0121, "step": 90 }, { "epoch": 0.0493339911198816, "grad_norm": 22.34126091003418, "learning_rate": 4.931642233569428e-05, "loss": 3.1712, "step": 100 }, { "epoch": 0.05426739023186976, "grad_norm": 19.47034454345703, "learning_rate": 4.923406358095866e-05, "loss": 3.2003, "step": 110 }, { "epoch": 0.05920078934385792, "grad_norm": 20.92117691040039, "learning_rate": 4.9151704826223025e-05, "loss": 3.2641, "step": 120 }, { "epoch": 0.06413418845584608, "grad_norm": 10.978850364685059, "learning_rate": 4.90693460714874e-05, "loss": 3.0391, "step": 130 }, { "epoch": 0.06906758756783424, "grad_norm": 12.89031982421875, "learning_rate": 4.8986987316751774e-05, "loss": 2.7204, "step": 140 }, { "epoch": 0.0740009866798224, "grad_norm": 9.097904205322266, "learning_rate": 4.890462856201615e-05, "loss": 2.9097, "step": 150 }, { "epoch": 0.07893438579181056, "grad_norm": 10.250645637512207, "learning_rate": 4.8822269807280516e-05, "loss": 2.8115, "step": 160 }, { "epoch": 0.08386778490379872, "grad_norm": 9.343478202819824, "learning_rate": 4.873991105254489e-05, "loss": 3.2778, "step": 170 }, { "epoch": 0.08880118401578688, "grad_norm": 10.233019828796387, "learning_rate": 4.8657552297809264e-05, "loss": 2.8652, "step": 180 }, { "epoch": 0.09373458312777504, "grad_norm": 9.129461288452148, "learning_rate": 4.857519354307363e-05, "loss": 2.7035, "step": 190 }, { "epoch": 0.0986679822397632, "grad_norm": 6.655858039855957, "learning_rate": 4.8492834788338006e-05, "loss": 2.8944, "step": 200 }, { "epoch": 0.10360138135175136, "grad_norm": 8.336820602416992, "learning_rate": 4.8410476033602374e-05, "loss": 2.4224, "step": 210 }, { "epoch": 0.10853478046373952, "grad_norm": 16.6269474029541, "learning_rate": 4.832811727886675e-05, "loss": 2.9059, "step": 220 }, { "epoch": 0.11346817957572768, "grad_norm": 14.81032657623291, "learning_rate": 4.8245758524131116e-05, "loss": 3.026, "step": 230 }, { "epoch": 0.11840157868771584, "grad_norm": 5.8194193840026855, "learning_rate": 4.816339976939549e-05, "loss": 2.6034, "step": 240 }, { "epoch": 0.123334977799704, "grad_norm": 12.556567192077637, "learning_rate": 4.808104101465986e-05, "loss": 2.9635, "step": 250 }, { "epoch": 0.12826837691169216, "grad_norm": 11.739566802978516, "learning_rate": 4.799868225992423e-05, "loss": 3.1042, "step": 260 }, { "epoch": 0.13320177602368033, "grad_norm": 9.672442436218262, "learning_rate": 4.791632350518861e-05, "loss": 3.0241, "step": 270 }, { "epoch": 0.13813517513566848, "grad_norm": 5.8154497146606445, "learning_rate": 4.7833964750452975e-05, "loss": 2.7709, "step": 280 }, { "epoch": 0.14306857424765665, "grad_norm": 20.954362869262695, "learning_rate": 4.775160599571735e-05, "loss": 2.8549, "step": 290 }, { "epoch": 0.1480019733596448, "grad_norm": 9.915385246276855, "learning_rate": 4.7669247240981717e-05, "loss": 2.9146, "step": 300 }, { "epoch": 0.15293537247163297, "grad_norm": 7.093989372253418, "learning_rate": 4.758688848624609e-05, "loss": 2.6771, "step": 310 }, { "epoch": 0.15786877158362111, "grad_norm": 8.882306098937988, "learning_rate": 4.750452973151046e-05, "loss": 2.6295, "step": 320 }, { "epoch": 0.1628021706956093, "grad_norm": 17.28571128845215, "learning_rate": 4.742217097677483e-05, "loss": 2.492, "step": 330 }, { "epoch": 0.16773556980759743, "grad_norm": 18.01361846923828, "learning_rate": 4.73398122220392e-05, "loss": 3.288, "step": 340 }, { "epoch": 0.1726689689195856, "grad_norm": 10.41086196899414, "learning_rate": 4.7257453467303575e-05, "loss": 2.6676, "step": 350 }, { "epoch": 0.17760236803157375, "grad_norm": 12.40413761138916, "learning_rate": 4.717509471256795e-05, "loss": 2.8085, "step": 360 }, { "epoch": 0.18253576714356193, "grad_norm": 13.229728698730469, "learning_rate": 4.709273595783232e-05, "loss": 2.2978, "step": 370 }, { "epoch": 0.18746916625555007, "grad_norm": 7.240847587585449, "learning_rate": 4.701037720309669e-05, "loss": 2.7408, "step": 380 }, { "epoch": 0.19240256536753825, "grad_norm": 10.95034122467041, "learning_rate": 4.692801844836106e-05, "loss": 2.7323, "step": 390 }, { "epoch": 0.1973359644795264, "grad_norm": 11.429134368896484, "learning_rate": 4.6845659693625434e-05, "loss": 2.6121, "step": 400 }, { "epoch": 0.20226936359151457, "grad_norm": 14.395548820495605, "learning_rate": 4.67633009388898e-05, "loss": 3.0402, "step": 410 }, { "epoch": 0.2072027627035027, "grad_norm": 5.770746231079102, "learning_rate": 4.668094218415418e-05, "loss": 2.4771, "step": 420 }, { "epoch": 0.2121361618154909, "grad_norm": 8.25928783416748, "learning_rate": 4.659858342941855e-05, "loss": 2.1011, "step": 430 }, { "epoch": 0.21706956092747903, "grad_norm": 14.545477867126465, "learning_rate": 4.6516224674682924e-05, "loss": 3.0616, "step": 440 }, { "epoch": 0.2220029600394672, "grad_norm": 11.545022964477539, "learning_rate": 4.643386591994729e-05, "loss": 2.2621, "step": 450 }, { "epoch": 0.22693635915145535, "grad_norm": 25.751686096191406, "learning_rate": 4.6351507165211666e-05, "loss": 2.7998, "step": 460 }, { "epoch": 0.23186975826344353, "grad_norm": 8.96506118774414, "learning_rate": 4.626914841047604e-05, "loss": 2.2543, "step": 470 }, { "epoch": 0.23680315737543167, "grad_norm": 8.918927192687988, "learning_rate": 4.618678965574041e-05, "loss": 2.2573, "step": 480 }, { "epoch": 0.24173655648741985, "grad_norm": 8.464729309082031, "learning_rate": 4.610443090100478e-05, "loss": 2.7079, "step": 490 }, { "epoch": 0.246669955599408, "grad_norm": 9.733293533325195, "learning_rate": 4.602207214626915e-05, "loss": 3.423, "step": 500 }, { "epoch": 0.25160335471139617, "grad_norm": 11.454011917114258, "learning_rate": 4.5939713391533525e-05, "loss": 3.1205, "step": 510 }, { "epoch": 0.2565367538233843, "grad_norm": 8.79787826538086, "learning_rate": 4.585735463679789e-05, "loss": 2.4529, "step": 520 }, { "epoch": 0.26147015293537246, "grad_norm": 6.300372123718262, "learning_rate": 4.577499588206227e-05, "loss": 2.3711, "step": 530 }, { "epoch": 0.26640355204736066, "grad_norm": 5.140571117401123, "learning_rate": 4.5692637127326634e-05, "loss": 2.3833, "step": 540 }, { "epoch": 0.2713369511593488, "grad_norm": 11.252778053283691, "learning_rate": 4.561027837259101e-05, "loss": 2.3281, "step": 550 }, { "epoch": 0.27627035027133695, "grad_norm": 10.254585266113281, "learning_rate": 4.552791961785538e-05, "loss": 3.0383, "step": 560 }, { "epoch": 0.2812037493833251, "grad_norm": 8.8512544631958, "learning_rate": 4.544556086311975e-05, "loss": 2.3813, "step": 570 }, { "epoch": 0.2861371484953133, "grad_norm": 4.089067459106445, "learning_rate": 4.5363202108384125e-05, "loss": 2.5559, "step": 580 }, { "epoch": 0.29107054760730144, "grad_norm": 5.320517539978027, "learning_rate": 4.528084335364849e-05, "loss": 2.7508, "step": 590 }, { "epoch": 0.2960039467192896, "grad_norm": 6.5414862632751465, "learning_rate": 4.519848459891287e-05, "loss": 2.3911, "step": 600 }, { "epoch": 0.30093734583127774, "grad_norm": 7.56878137588501, "learning_rate": 4.5116125844177235e-05, "loss": 2.9435, "step": 610 }, { "epoch": 0.30587074494326594, "grad_norm": 10.586882591247559, "learning_rate": 4.503376708944161e-05, "loss": 2.6231, "step": 620 }, { "epoch": 0.3108041440552541, "grad_norm": 9.455619812011719, "learning_rate": 4.495140833470598e-05, "loss": 2.8628, "step": 630 }, { "epoch": 0.31573754316724223, "grad_norm": 5.5599565505981445, "learning_rate": 4.486904957997035e-05, "loss": 2.2629, "step": 640 }, { "epoch": 0.3206709422792304, "grad_norm": 5.07106351852417, "learning_rate": 4.4786690825234726e-05, "loss": 2.1062, "step": 650 }, { "epoch": 0.3256043413912186, "grad_norm": 8.023340225219727, "learning_rate": 4.470433207049909e-05, "loss": 2.4445, "step": 660 }, { "epoch": 0.3305377405032067, "grad_norm": 12.941463470458984, "learning_rate": 4.462197331576347e-05, "loss": 2.3829, "step": 670 }, { "epoch": 0.33547113961519487, "grad_norm": 9.589372634887695, "learning_rate": 4.4539614561027835e-05, "loss": 2.8525, "step": 680 }, { "epoch": 0.340404538727183, "grad_norm": 29.511812210083008, "learning_rate": 4.445725580629221e-05, "loss": 2.4082, "step": 690 }, { "epoch": 0.3453379378391712, "grad_norm": 17.187841415405273, "learning_rate": 4.437489705155658e-05, "loss": 2.3018, "step": 700 }, { "epoch": 0.35027133695115936, "grad_norm": 6.561307430267334, "learning_rate": 4.429253829682096e-05, "loss": 2.4964, "step": 710 }, { "epoch": 0.3552047360631475, "grad_norm": 8.075435638427734, "learning_rate": 4.4210179542085326e-05, "loss": 2.4871, "step": 720 }, { "epoch": 0.36013813517513565, "grad_norm": 8.039571762084961, "learning_rate": 4.41278207873497e-05, "loss": 2.4563, "step": 730 }, { "epoch": 0.36507153428712386, "grad_norm": 24.152629852294922, "learning_rate": 4.404546203261407e-05, "loss": 2.549, "step": 740 }, { "epoch": 0.370004933399112, "grad_norm": 2.6132395267486572, "learning_rate": 4.396310327787844e-05, "loss": 2.6625, "step": 750 }, { "epoch": 0.37493833251110015, "grad_norm": 11.909361839294434, "learning_rate": 4.388074452314282e-05, "loss": 3.0034, "step": 760 }, { "epoch": 0.3798717316230883, "grad_norm": 6.0602240562438965, "learning_rate": 4.3798385768407185e-05, "loss": 2.8432, "step": 770 }, { "epoch": 0.3848051307350765, "grad_norm": 6.063085079193115, "learning_rate": 4.371602701367156e-05, "loss": 2.4882, "step": 780 }, { "epoch": 0.38973852984706464, "grad_norm": 12.514052391052246, "learning_rate": 4.363366825893593e-05, "loss": 2.5734, "step": 790 }, { "epoch": 0.3946719289590528, "grad_norm": 5.626593589782715, "learning_rate": 4.35513095042003e-05, "loss": 1.8927, "step": 800 }, { "epoch": 0.39960532807104093, "grad_norm": 6.073607444763184, "learning_rate": 4.346895074946467e-05, "loss": 2.4321, "step": 810 }, { "epoch": 0.40453872718302913, "grad_norm": 7.1377763748168945, "learning_rate": 4.338659199472904e-05, "loss": 3.0152, "step": 820 }, { "epoch": 0.4094721262950173, "grad_norm": 5.029105186462402, "learning_rate": 4.330423323999341e-05, "loss": 2.3915, "step": 830 }, { "epoch": 0.4144055254070054, "grad_norm": 5.954276084899902, "learning_rate": 4.3221874485257785e-05, "loss": 2.5008, "step": 840 }, { "epoch": 0.41933892451899357, "grad_norm": 7.565403938293457, "learning_rate": 4.313951573052216e-05, "loss": 2.5035, "step": 850 }, { "epoch": 0.4242723236309818, "grad_norm": 7.7029314041137695, "learning_rate": 4.305715697578653e-05, "loss": 2.4186, "step": 860 }, { "epoch": 0.4292057227429699, "grad_norm": 8.457364082336426, "learning_rate": 4.29747982210509e-05, "loss": 2.5165, "step": 870 }, { "epoch": 0.43413912185495807, "grad_norm": 10.918882369995117, "learning_rate": 4.289243946631527e-05, "loss": 2.4591, "step": 880 }, { "epoch": 0.4390725209669462, "grad_norm": 9.553812026977539, "learning_rate": 4.2810080711579644e-05, "loss": 2.6194, "step": 890 }, { "epoch": 0.4440059200789344, "grad_norm": 9.679872512817383, "learning_rate": 4.272772195684401e-05, "loss": 2.9103, "step": 900 }, { "epoch": 0.44893931919092256, "grad_norm": 5.805199146270752, "learning_rate": 4.2645363202108386e-05, "loss": 2.202, "step": 910 }, { "epoch": 0.4538727183029107, "grad_norm": 7.699798107147217, "learning_rate": 4.256300444737275e-05, "loss": 3.0242, "step": 920 }, { "epoch": 0.45880611741489885, "grad_norm": 5.121403217315674, "learning_rate": 4.248064569263713e-05, "loss": 2.2204, "step": 930 }, { "epoch": 0.46373951652688705, "grad_norm": 5.217260837554932, "learning_rate": 4.23982869379015e-05, "loss": 2.6708, "step": 940 }, { "epoch": 0.4686729156388752, "grad_norm": 9.238679885864258, "learning_rate": 4.231592818316587e-05, "loss": 2.7795, "step": 950 }, { "epoch": 0.47360631475086334, "grad_norm": 4.4800496101379395, "learning_rate": 4.2233569428430244e-05, "loss": 2.5765, "step": 960 }, { "epoch": 0.4785397138628515, "grad_norm": 11.48628044128418, "learning_rate": 4.215121067369461e-05, "loss": 2.7761, "step": 970 }, { "epoch": 0.4834731129748397, "grad_norm": 8.910355567932129, "learning_rate": 4.2068851918958986e-05, "loss": 2.8094, "step": 980 }, { "epoch": 0.48840651208682784, "grad_norm": 7.16322660446167, "learning_rate": 4.198649316422336e-05, "loss": 2.3914, "step": 990 }, { "epoch": 0.493339911198816, "grad_norm": 15.859028816223145, "learning_rate": 4.1904134409487735e-05, "loss": 2.5077, "step": 1000 }, { "epoch": 0.49827331031080413, "grad_norm": 10.420004844665527, "learning_rate": 4.18217756547521e-05, "loss": 2.4869, "step": 1010 }, { "epoch": 0.5032067094227923, "grad_norm": 5.477315902709961, "learning_rate": 4.173941690001648e-05, "loss": 2.8359, "step": 1020 }, { "epoch": 0.5081401085347804, "grad_norm": 5.169394493103027, "learning_rate": 4.1657058145280845e-05, "loss": 2.4023, "step": 1030 }, { "epoch": 0.5130735076467686, "grad_norm": 8.969653129577637, "learning_rate": 4.157469939054522e-05, "loss": 2.7727, "step": 1040 }, { "epoch": 0.5180069067587568, "grad_norm": 7.325026512145996, "learning_rate": 4.149234063580959e-05, "loss": 2.3592, "step": 1050 }, { "epoch": 0.5229403058707449, "grad_norm": 5.182565212249756, "learning_rate": 4.140998188107396e-05, "loss": 2.4861, "step": 1060 }, { "epoch": 0.5278737049827331, "grad_norm": 6.98890495300293, "learning_rate": 4.1327623126338335e-05, "loss": 1.9875, "step": 1070 }, { "epoch": 0.5328071040947213, "grad_norm": 7.717536449432373, "learning_rate": 4.12452643716027e-05, "loss": 1.7559, "step": 1080 }, { "epoch": 0.5377405032067094, "grad_norm": 8.016460418701172, "learning_rate": 4.116290561686708e-05, "loss": 2.8117, "step": 1090 }, { "epoch": 0.5426739023186976, "grad_norm": 6.484166622161865, "learning_rate": 4.1080546862131445e-05, "loss": 2.4036, "step": 1100 }, { "epoch": 0.5476073014306857, "grad_norm": 4.496994495391846, "learning_rate": 4.099818810739582e-05, "loss": 2.4948, "step": 1110 }, { "epoch": 0.5525407005426739, "grad_norm": 6.8036065101623535, "learning_rate": 4.091582935266019e-05, "loss": 2.4636, "step": 1120 }, { "epoch": 0.5574740996546621, "grad_norm": 8.330781936645508, "learning_rate": 4.083347059792456e-05, "loss": 2.6983, "step": 1130 }, { "epoch": 0.5624074987666502, "grad_norm": 6.868759632110596, "learning_rate": 4.0751111843188936e-05, "loss": 2.5327, "step": 1140 }, { "epoch": 0.5673408978786384, "grad_norm": 7.583151340484619, "learning_rate": 4.0668753088453303e-05, "loss": 2.4815, "step": 1150 }, { "epoch": 0.5722742969906266, "grad_norm": 7.823330402374268, "learning_rate": 4.058639433371768e-05, "loss": 2.2532, "step": 1160 }, { "epoch": 0.5772076961026147, "grad_norm": 6.149056434631348, "learning_rate": 4.0504035578982045e-05, "loss": 2.762, "step": 1170 }, { "epoch": 0.5821410952146029, "grad_norm": 9.300018310546875, "learning_rate": 4.042167682424642e-05, "loss": 2.505, "step": 1180 }, { "epoch": 0.587074494326591, "grad_norm": 10.552204132080078, "learning_rate": 4.033931806951079e-05, "loss": 2.6128, "step": 1190 }, { "epoch": 0.5920078934385792, "grad_norm": 10.5072603225708, "learning_rate": 4.025695931477516e-05, "loss": 2.6206, "step": 1200 }, { "epoch": 0.5969412925505674, "grad_norm": 5.355686664581299, "learning_rate": 4.017460056003953e-05, "loss": 2.8745, "step": 1210 }, { "epoch": 0.6018746916625555, "grad_norm": 4.0471649169921875, "learning_rate": 4.0092241805303904e-05, "loss": 2.7503, "step": 1220 }, { "epoch": 0.6068080907745437, "grad_norm": 6.991344928741455, "learning_rate": 4.000988305056828e-05, "loss": 2.686, "step": 1230 }, { "epoch": 0.6117414898865319, "grad_norm": 4.674221515655518, "learning_rate": 3.9927524295832646e-05, "loss": 2.5496, "step": 1240 }, { "epoch": 0.61667488899852, "grad_norm": 23.224143981933594, "learning_rate": 3.984516554109702e-05, "loss": 3.0394, "step": 1250 }, { "epoch": 0.6216082881105082, "grad_norm": 8.688756942749023, "learning_rate": 3.976280678636139e-05, "loss": 2.7851, "step": 1260 }, { "epoch": 0.6265416872224963, "grad_norm": 6.090677261352539, "learning_rate": 3.968044803162576e-05, "loss": 2.2354, "step": 1270 }, { "epoch": 0.6314750863344845, "grad_norm": 9.011785507202148, "learning_rate": 3.959808927689014e-05, "loss": 2.1909, "step": 1280 }, { "epoch": 0.6364084854464727, "grad_norm": 9.320941925048828, "learning_rate": 3.951573052215451e-05, "loss": 2.1964, "step": 1290 }, { "epoch": 0.6413418845584608, "grad_norm": 6.079468250274658, "learning_rate": 3.943337176741888e-05, "loss": 2.1266, "step": 1300 }, { "epoch": 0.646275283670449, "grad_norm": 6.427783489227295, "learning_rate": 3.935101301268325e-05, "loss": 2.1273, "step": 1310 }, { "epoch": 0.6512086827824372, "grad_norm": 5.442935943603516, "learning_rate": 3.926865425794762e-05, "loss": 2.3792, "step": 1320 }, { "epoch": 0.6561420818944252, "grad_norm": 5.503900527954102, "learning_rate": 3.9186295503211995e-05, "loss": 2.3938, "step": 1330 }, { "epoch": 0.6610754810064134, "grad_norm": 9.838761329650879, "learning_rate": 3.910393674847637e-05, "loss": 2.2793, "step": 1340 }, { "epoch": 0.6660088801184015, "grad_norm": 7.890000343322754, "learning_rate": 3.902157799374074e-05, "loss": 2.3842, "step": 1350 }, { "epoch": 0.6709422792303897, "grad_norm": 7.321061134338379, "learning_rate": 3.893921923900511e-05, "loss": 2.3586, "step": 1360 }, { "epoch": 0.6758756783423779, "grad_norm": 5.176478385925293, "learning_rate": 3.885686048426948e-05, "loss": 2.6807, "step": 1370 }, { "epoch": 0.680809077454366, "grad_norm": 4.9928975105285645, "learning_rate": 3.8774501729533854e-05, "loss": 1.8495, "step": 1380 }, { "epoch": 0.6857424765663542, "grad_norm": 6.170323371887207, "learning_rate": 3.869214297479822e-05, "loss": 2.415, "step": 1390 }, { "epoch": 0.6906758756783424, "grad_norm": 10.49080753326416, "learning_rate": 3.8609784220062596e-05, "loss": 1.8006, "step": 1400 }, { "epoch": 0.6956092747903305, "grad_norm": 16.418981552124023, "learning_rate": 3.852742546532696e-05, "loss": 2.5657, "step": 1410 }, { "epoch": 0.7005426739023187, "grad_norm": 4.002269268035889, "learning_rate": 3.844506671059134e-05, "loss": 2.5927, "step": 1420 }, { "epoch": 0.7054760730143068, "grad_norm": 7.075521469116211, "learning_rate": 3.836270795585571e-05, "loss": 1.8786, "step": 1430 }, { "epoch": 0.710409472126295, "grad_norm": 6.2812981605529785, "learning_rate": 3.828034920112008e-05, "loss": 2.1746, "step": 1440 }, { "epoch": 0.7153428712382832, "grad_norm": 7.315672874450684, "learning_rate": 3.8197990446384454e-05, "loss": 2.4418, "step": 1450 }, { "epoch": 0.7202762703502713, "grad_norm": 7.978590488433838, "learning_rate": 3.811563169164882e-05, "loss": 2.3702, "step": 1460 }, { "epoch": 0.7252096694622595, "grad_norm": 8.856497764587402, "learning_rate": 3.8033272936913196e-05, "loss": 2.7627, "step": 1470 }, { "epoch": 0.7301430685742477, "grad_norm": 5.009411811828613, "learning_rate": 3.7950914182177564e-05, "loss": 2.1163, "step": 1480 }, { "epoch": 0.7350764676862358, "grad_norm": 6.4281134605407715, "learning_rate": 3.786855542744194e-05, "loss": 2.554, "step": 1490 }, { "epoch": 0.740009866798224, "grad_norm": 6.119235515594482, "learning_rate": 3.7786196672706306e-05, "loss": 2.1492, "step": 1500 }, { "epoch": 0.7449432659102121, "grad_norm": 4.099040508270264, "learning_rate": 3.770383791797068e-05, "loss": 2.9967, "step": 1510 }, { "epoch": 0.7498766650222003, "grad_norm": 6.364965438842773, "learning_rate": 3.7621479163235055e-05, "loss": 2.7822, "step": 1520 }, { "epoch": 0.7548100641341885, "grad_norm": 13.07193374633789, "learning_rate": 3.753912040849942e-05, "loss": 2.1634, "step": 1530 }, { "epoch": 0.7597434632461766, "grad_norm": 4.285301685333252, "learning_rate": 3.74567616537638e-05, "loss": 2.49, "step": 1540 }, { "epoch": 0.7646768623581648, "grad_norm": 8.684725761413574, "learning_rate": 3.7374402899028164e-05, "loss": 2.4202, "step": 1550 }, { "epoch": 0.769610261470153, "grad_norm": 7.857943058013916, "learning_rate": 3.7292044144292545e-05, "loss": 2.5489, "step": 1560 }, { "epoch": 0.7745436605821411, "grad_norm": 10.40462589263916, "learning_rate": 3.720968538955691e-05, "loss": 2.5975, "step": 1570 }, { "epoch": 0.7794770596941293, "grad_norm": 3.614388942718506, "learning_rate": 3.712732663482129e-05, "loss": 2.5092, "step": 1580 }, { "epoch": 0.7844104588061174, "grad_norm": 4.367038726806641, "learning_rate": 3.7044967880085655e-05, "loss": 2.4972, "step": 1590 }, { "epoch": 0.7893438579181056, "grad_norm": 8.629862785339355, "learning_rate": 3.696260912535003e-05, "loss": 2.8085, "step": 1600 }, { "epoch": 0.7942772570300938, "grad_norm": 4.455677032470703, "learning_rate": 3.68802503706144e-05, "loss": 2.2631, "step": 1610 }, { "epoch": 0.7992106561420819, "grad_norm": 3.8020565509796143, "learning_rate": 3.679789161587877e-05, "loss": 2.1875, "step": 1620 }, { "epoch": 0.8041440552540701, "grad_norm": 8.94434928894043, "learning_rate": 3.6715532861143146e-05, "loss": 2.4996, "step": 1630 }, { "epoch": 0.8090774543660583, "grad_norm": 6.989756107330322, "learning_rate": 3.6633174106407514e-05, "loss": 2.0738, "step": 1640 }, { "epoch": 0.8140108534780464, "grad_norm": 6.769233226776123, "learning_rate": 3.655081535167189e-05, "loss": 2.4975, "step": 1650 }, { "epoch": 0.8189442525900346, "grad_norm": 5.789883136749268, "learning_rate": 3.6468456596936256e-05, "loss": 2.1423, "step": 1660 }, { "epoch": 0.8238776517020227, "grad_norm": 8.308130264282227, "learning_rate": 3.638609784220063e-05, "loss": 2.364, "step": 1670 }, { "epoch": 0.8288110508140109, "grad_norm": 7.824023246765137, "learning_rate": 3.6303739087465e-05, "loss": 2.001, "step": 1680 }, { "epoch": 0.833744449925999, "grad_norm": 6.625056266784668, "learning_rate": 3.622138033272937e-05, "loss": 2.4029, "step": 1690 }, { "epoch": 0.8386778490379871, "grad_norm": 3.973498582839966, "learning_rate": 3.613902157799374e-05, "loss": 2.766, "step": 1700 }, { "epoch": 0.8436112481499753, "grad_norm": 6.130688667297363, "learning_rate": 3.6056662823258114e-05, "loss": 2.4471, "step": 1710 }, { "epoch": 0.8485446472619635, "grad_norm": 4.856011867523193, "learning_rate": 3.597430406852249e-05, "loss": 2.035, "step": 1720 }, { "epoch": 0.8534780463739516, "grad_norm": 5.570549488067627, "learning_rate": 3.5891945313786856e-05, "loss": 1.8751, "step": 1730 }, { "epoch": 0.8584114454859398, "grad_norm": 8.646450996398926, "learning_rate": 3.580958655905123e-05, "loss": 2.0868, "step": 1740 }, { "epoch": 0.8633448445979279, "grad_norm": 5.265438556671143, "learning_rate": 3.57272278043156e-05, "loss": 2.5141, "step": 1750 }, { "epoch": 0.8682782437099161, "grad_norm": 11.451949119567871, "learning_rate": 3.564486904957997e-05, "loss": 2.3301, "step": 1760 }, { "epoch": 0.8732116428219043, "grad_norm": 8.040514945983887, "learning_rate": 3.556251029484434e-05, "loss": 2.4714, "step": 1770 }, { "epoch": 0.8781450419338924, "grad_norm": 6.2226481437683105, "learning_rate": 3.5480151540108715e-05, "loss": 2.667, "step": 1780 }, { "epoch": 0.8830784410458806, "grad_norm": 9.629585266113281, "learning_rate": 3.539779278537308e-05, "loss": 2.6848, "step": 1790 }, { "epoch": 0.8880118401578688, "grad_norm": 8.996950149536133, "learning_rate": 3.5315434030637457e-05, "loss": 2.4991, "step": 1800 }, { "epoch": 0.8929452392698569, "grad_norm": 7.078036785125732, "learning_rate": 3.523307527590183e-05, "loss": 2.5703, "step": 1810 }, { "epoch": 0.8978786383818451, "grad_norm": 6.913575649261475, "learning_rate": 3.51507165211662e-05, "loss": 2.5782, "step": 1820 }, { "epoch": 0.9028120374938332, "grad_norm": 8.875394821166992, "learning_rate": 3.506835776643057e-05, "loss": 2.1843, "step": 1830 }, { "epoch": 0.9077454366058214, "grad_norm": 12.219450950622559, "learning_rate": 3.498599901169494e-05, "loss": 2.7252, "step": 1840 }, { "epoch": 0.9126788357178096, "grad_norm": 8.12746524810791, "learning_rate": 3.490364025695932e-05, "loss": 2.2132, "step": 1850 }, { "epoch": 0.9176122348297977, "grad_norm": 17.30431365966797, "learning_rate": 3.482128150222369e-05, "loss": 2.807, "step": 1860 }, { "epoch": 0.9225456339417859, "grad_norm": 4.950164794921875, "learning_rate": 3.4738922747488064e-05, "loss": 2.2689, "step": 1870 }, { "epoch": 0.9274790330537741, "grad_norm": 6.925759315490723, "learning_rate": 3.465656399275243e-05, "loss": 2.3312, "step": 1880 }, { "epoch": 0.9324124321657622, "grad_norm": 5.608890533447266, "learning_rate": 3.4574205238016806e-05, "loss": 2.305, "step": 1890 }, { "epoch": 0.9373458312777504, "grad_norm": 8.436676025390625, "learning_rate": 3.4491846483281173e-05, "loss": 2.4468, "step": 1900 }, { "epoch": 0.9422792303897385, "grad_norm": 14.684228897094727, "learning_rate": 3.440948772854555e-05, "loss": 2.4759, "step": 1910 }, { "epoch": 0.9472126295017267, "grad_norm": 9.368124961853027, "learning_rate": 3.432712897380992e-05, "loss": 2.5555, "step": 1920 }, { "epoch": 0.9521460286137149, "grad_norm": 8.614909172058105, "learning_rate": 3.424477021907429e-05, "loss": 2.351, "step": 1930 }, { "epoch": 0.957079427725703, "grad_norm": 10.137232780456543, "learning_rate": 3.4162411464338664e-05, "loss": 2.2095, "step": 1940 }, { "epoch": 0.9620128268376912, "grad_norm": 5.5300493240356445, "learning_rate": 3.408005270960303e-05, "loss": 2.2628, "step": 1950 }, { "epoch": 0.9669462259496794, "grad_norm": 5.127216815948486, "learning_rate": 3.3997693954867406e-05, "loss": 1.9502, "step": 1960 }, { "epoch": 0.9718796250616675, "grad_norm": 4.680507659912109, "learning_rate": 3.3915335200131774e-05, "loss": 1.8751, "step": 1970 }, { "epoch": 0.9768130241736557, "grad_norm": 6.784450531005859, "learning_rate": 3.383297644539615e-05, "loss": 2.477, "step": 1980 }, { "epoch": 0.9817464232856438, "grad_norm": 6.171282768249512, "learning_rate": 3.3750617690660516e-05, "loss": 2.4942, "step": 1990 }, { "epoch": 0.986679822397632, "grad_norm": 6.970947265625, "learning_rate": 3.366825893592489e-05, "loss": 2.2718, "step": 2000 }, { "epoch": 0.9916132215096202, "grad_norm": 4.48728084564209, "learning_rate": 3.3585900181189265e-05, "loss": 2.6097, "step": 2010 }, { "epoch": 0.9965466206216083, "grad_norm": 13.015890121459961, "learning_rate": 3.350354142645363e-05, "loss": 2.7752, "step": 2020 }, { "epoch": 1.0014800197335965, "grad_norm": 7.251813888549805, "learning_rate": 3.342118267171801e-05, "loss": 2.0151, "step": 2030 }, { "epoch": 1.0064134188455847, "grad_norm": 5.12528133392334, "learning_rate": 3.3338823916982374e-05, "loss": 2.0101, "step": 2040 }, { "epoch": 1.0113468179575729, "grad_norm": 5.179903507232666, "learning_rate": 3.325646516224675e-05, "loss": 2.0539, "step": 2050 }, { "epoch": 1.0162802170695608, "grad_norm": 13.52995777130127, "learning_rate": 3.3174106407511116e-05, "loss": 1.9679, "step": 2060 }, { "epoch": 1.021213616181549, "grad_norm": 10.211468696594238, "learning_rate": 3.309174765277549e-05, "loss": 2.4251, "step": 2070 }, { "epoch": 1.0261470152935372, "grad_norm": 11.468674659729004, "learning_rate": 3.300938889803986e-05, "loss": 2.026, "step": 2080 }, { "epoch": 1.0310804144055254, "grad_norm": 9.60104751586914, "learning_rate": 3.292703014330423e-05, "loss": 2.0974, "step": 2090 }, { "epoch": 1.0360138135175136, "grad_norm": 16.93134307861328, "learning_rate": 3.284467138856861e-05, "loss": 1.8676, "step": 2100 }, { "epoch": 1.0409472126295016, "grad_norm": 7.654803276062012, "learning_rate": 3.2762312633832975e-05, "loss": 1.6796, "step": 2110 }, { "epoch": 1.0458806117414898, "grad_norm": 7.115079879760742, "learning_rate": 3.267995387909735e-05, "loss": 1.9246, "step": 2120 }, { "epoch": 1.050814010853478, "grad_norm": 5.4859185218811035, "learning_rate": 3.2597595124361724e-05, "loss": 2.0991, "step": 2130 }, { "epoch": 1.0557474099654662, "grad_norm": 16.93896484375, "learning_rate": 3.25152363696261e-05, "loss": 1.87, "step": 2140 }, { "epoch": 1.0606808090774544, "grad_norm": 8.788496017456055, "learning_rate": 3.2432877614890466e-05, "loss": 1.8518, "step": 2150 }, { "epoch": 1.0656142081894426, "grad_norm": 5.605971336364746, "learning_rate": 3.235051886015484e-05, "loss": 1.4863, "step": 2160 }, { "epoch": 1.0705476073014306, "grad_norm": 8.495088577270508, "learning_rate": 3.226816010541921e-05, "loss": 1.7369, "step": 2170 }, { "epoch": 1.0754810064134188, "grad_norm": 9.221963882446289, "learning_rate": 3.218580135068358e-05, "loss": 2.0833, "step": 2180 }, { "epoch": 1.080414405525407, "grad_norm": 5.741699695587158, "learning_rate": 3.210344259594795e-05, "loss": 2.4029, "step": 2190 }, { "epoch": 1.0853478046373952, "grad_norm": 9.234859466552734, "learning_rate": 3.2021083841212324e-05, "loss": 1.5287, "step": 2200 }, { "epoch": 1.0902812037493834, "grad_norm": 7.735116004943848, "learning_rate": 3.19387250864767e-05, "loss": 1.9516, "step": 2210 }, { "epoch": 1.0952146028613714, "grad_norm": 6.103670120239258, "learning_rate": 3.1856366331741066e-05, "loss": 1.9241, "step": 2220 }, { "epoch": 1.1001480019733596, "grad_norm": 5.348116397857666, "learning_rate": 3.177400757700544e-05, "loss": 1.6599, "step": 2230 }, { "epoch": 1.1050814010853478, "grad_norm": 11.653718948364258, "learning_rate": 3.169164882226981e-05, "loss": 1.9352, "step": 2240 }, { "epoch": 1.110014800197336, "grad_norm": 8.342635154724121, "learning_rate": 3.160929006753418e-05, "loss": 1.963, "step": 2250 }, { "epoch": 1.1149481993093242, "grad_norm": 6.9188103675842285, "learning_rate": 3.152693131279855e-05, "loss": 1.9066, "step": 2260 }, { "epoch": 1.1198815984213122, "grad_norm": 7.183344841003418, "learning_rate": 3.1444572558062925e-05, "loss": 1.9223, "step": 2270 }, { "epoch": 1.1248149975333004, "grad_norm": 6.872612476348877, "learning_rate": 3.136221380332729e-05, "loss": 1.5956, "step": 2280 }, { "epoch": 1.1297483966452886, "grad_norm": 7.547828674316406, "learning_rate": 3.127985504859167e-05, "loss": 1.7239, "step": 2290 }, { "epoch": 1.1346817957572768, "grad_norm": 9.748641014099121, "learning_rate": 3.119749629385604e-05, "loss": 1.455, "step": 2300 }, { "epoch": 1.139615194869265, "grad_norm": 13.310020446777344, "learning_rate": 3.111513753912041e-05, "loss": 1.9041, "step": 2310 }, { "epoch": 1.144548593981253, "grad_norm": 8.848477363586426, "learning_rate": 3.103277878438478e-05, "loss": 2.0644, "step": 2320 }, { "epoch": 1.1494819930932412, "grad_norm": 7.570590019226074, "learning_rate": 3.095042002964915e-05, "loss": 2.1607, "step": 2330 }, { "epoch": 1.1544153922052294, "grad_norm": 7.833923816680908, "learning_rate": 3.0868061274913525e-05, "loss": 1.5976, "step": 2340 }, { "epoch": 1.1593487913172176, "grad_norm": 6.807453632354736, "learning_rate": 3.078570252017789e-05, "loss": 1.8354, "step": 2350 }, { "epoch": 1.1642821904292058, "grad_norm": 5.707053184509277, "learning_rate": 3.070334376544227e-05, "loss": 2.0815, "step": 2360 }, { "epoch": 1.169215589541194, "grad_norm": 3.920215606689453, "learning_rate": 3.0620985010706635e-05, "loss": 2.1072, "step": 2370 }, { "epoch": 1.174148988653182, "grad_norm": 8.955766677856445, "learning_rate": 3.053862625597101e-05, "loss": 1.9202, "step": 2380 }, { "epoch": 1.1790823877651702, "grad_norm": 6.6745219230651855, "learning_rate": 3.045626750123538e-05, "loss": 1.9565, "step": 2390 }, { "epoch": 1.1840157868771584, "grad_norm": 5.365530967712402, "learning_rate": 3.037390874649975e-05, "loss": 1.8332, "step": 2400 }, { "epoch": 1.1889491859891466, "grad_norm": 4.238736629486084, "learning_rate": 3.0291549991764122e-05, "loss": 1.6349, "step": 2410 }, { "epoch": 1.1938825851011348, "grad_norm": 4.686330795288086, "learning_rate": 3.02091912370285e-05, "loss": 2.0218, "step": 2420 }, { "epoch": 1.1988159842131227, "grad_norm": 10.073396682739258, "learning_rate": 3.012683248229287e-05, "loss": 2.1104, "step": 2430 }, { "epoch": 1.203749383325111, "grad_norm": 7.226747035980225, "learning_rate": 3.0044473727557242e-05, "loss": 1.7695, "step": 2440 }, { "epoch": 1.2086827824370991, "grad_norm": 6.546811103820801, "learning_rate": 2.9962114972821613e-05, "loss": 1.5873, "step": 2450 }, { "epoch": 1.2136161815490873, "grad_norm": 7.798117637634277, "learning_rate": 2.9879756218085987e-05, "loss": 1.7036, "step": 2460 }, { "epoch": 1.2185495806610755, "grad_norm": 5.489384174346924, "learning_rate": 2.979739746335036e-05, "loss": 1.9574, "step": 2470 }, { "epoch": 1.2234829797730638, "grad_norm": 11.180368423461914, "learning_rate": 2.971503870861473e-05, "loss": 1.7365, "step": 2480 }, { "epoch": 1.2284163788850517, "grad_norm": 6.837683200836182, "learning_rate": 2.96326799538791e-05, "loss": 1.8239, "step": 2490 }, { "epoch": 1.23334977799704, "grad_norm": 6.739687919616699, "learning_rate": 2.955032119914347e-05, "loss": 1.6268, "step": 2500 }, { "epoch": 1.2382831771090281, "grad_norm": 6.837540149688721, "learning_rate": 2.9467962444407842e-05, "loss": 1.9372, "step": 2510 }, { "epoch": 1.2432165762210163, "grad_norm": 12.321346282958984, "learning_rate": 2.9385603689672213e-05, "loss": 1.7772, "step": 2520 }, { "epoch": 1.2481499753330045, "grad_norm": 5.676684379577637, "learning_rate": 2.9303244934936584e-05, "loss": 2.2801, "step": 2530 }, { "epoch": 1.2530833744449925, "grad_norm": 8.29293155670166, "learning_rate": 2.9220886180200955e-05, "loss": 1.9417, "step": 2540 }, { "epoch": 1.2580167735569807, "grad_norm": 10.451891899108887, "learning_rate": 2.913852742546533e-05, "loss": 1.9312, "step": 2550 }, { "epoch": 1.262950172668969, "grad_norm": 6.803496837615967, "learning_rate": 2.90561686707297e-05, "loss": 2.1992, "step": 2560 }, { "epoch": 1.2678835717809571, "grad_norm": 7.187436580657959, "learning_rate": 2.8973809915994072e-05, "loss": 1.9083, "step": 2570 }, { "epoch": 1.2728169708929453, "grad_norm": 8.493294715881348, "learning_rate": 2.8891451161258443e-05, "loss": 1.796, "step": 2580 }, { "epoch": 1.2777503700049335, "grad_norm": 10.225448608398438, "learning_rate": 2.8809092406522814e-05, "loss": 2.2556, "step": 2590 }, { "epoch": 1.2826837691169215, "grad_norm": 5.342253684997559, "learning_rate": 2.8726733651787185e-05, "loss": 1.8728, "step": 2600 }, { "epoch": 1.2876171682289097, "grad_norm": 6.041278839111328, "learning_rate": 2.8644374897051556e-05, "loss": 1.9532, "step": 2610 }, { "epoch": 1.292550567340898, "grad_norm": 6.2567853927612305, "learning_rate": 2.8562016142315927e-05, "loss": 2.0493, "step": 2620 }, { "epoch": 1.297483966452886, "grad_norm": 9.334773063659668, "learning_rate": 2.8479657387580298e-05, "loss": 1.573, "step": 2630 }, { "epoch": 1.302417365564874, "grad_norm": 13.7177734375, "learning_rate": 2.8397298632844672e-05, "loss": 2.2434, "step": 2640 }, { "epoch": 1.3073507646768623, "grad_norm": 11.203829765319824, "learning_rate": 2.8314939878109043e-05, "loss": 2.157, "step": 2650 }, { "epoch": 1.3122841637888505, "grad_norm": 7.4039483070373535, "learning_rate": 2.8232581123373414e-05, "loss": 2.0443, "step": 2660 }, { "epoch": 1.3172175629008387, "grad_norm": 4.927941799163818, "learning_rate": 2.8150222368637785e-05, "loss": 2.3465, "step": 2670 }, { "epoch": 1.322150962012827, "grad_norm": 11.956111907958984, "learning_rate": 2.8067863613902156e-05, "loss": 1.9842, "step": 2680 }, { "epoch": 1.327084361124815, "grad_norm": 5.877150535583496, "learning_rate": 2.7985504859166527e-05, "loss": 2.1889, "step": 2690 }, { "epoch": 1.3320177602368033, "grad_norm": 5.822342395782471, "learning_rate": 2.7903146104430905e-05, "loss": 1.6998, "step": 2700 }, { "epoch": 1.3369511593487913, "grad_norm": 5.760201930999756, "learning_rate": 2.7820787349695276e-05, "loss": 1.7595, "step": 2710 }, { "epoch": 1.3418845584607795, "grad_norm": 4.266313552856445, "learning_rate": 2.7738428594959647e-05, "loss": 1.4794, "step": 2720 }, { "epoch": 1.3468179575727677, "grad_norm": 9.27362060546875, "learning_rate": 2.7656069840224018e-05, "loss": 1.9813, "step": 2730 }, { "epoch": 1.3517513566847559, "grad_norm": 6.628077507019043, "learning_rate": 2.757371108548839e-05, "loss": 1.7057, "step": 2740 }, { "epoch": 1.3566847557967439, "grad_norm": 4.694432735443115, "learning_rate": 2.7491352330752764e-05, "loss": 1.8465, "step": 2750 }, { "epoch": 1.361618154908732, "grad_norm": 6.031017780303955, "learning_rate": 2.7408993576017135e-05, "loss": 1.2289, "step": 2760 }, { "epoch": 1.3665515540207203, "grad_norm": 5.344706058502197, "learning_rate": 2.7326634821281506e-05, "loss": 1.9155, "step": 2770 }, { "epoch": 1.3714849531327085, "grad_norm": 5.284564018249512, "learning_rate": 2.7244276066545877e-05, "loss": 1.7855, "step": 2780 }, { "epoch": 1.3764183522446967, "grad_norm": 4.38336181640625, "learning_rate": 2.7161917311810248e-05, "loss": 2.0217, "step": 2790 }, { "epoch": 1.3813517513566849, "grad_norm": 10.120343208312988, "learning_rate": 2.707955855707462e-05, "loss": 2.19, "step": 2800 }, { "epoch": 1.3862851504686728, "grad_norm": 6.627511024475098, "learning_rate": 2.699719980233899e-05, "loss": 1.9309, "step": 2810 }, { "epoch": 1.391218549580661, "grad_norm": 2.8276286125183105, "learning_rate": 2.691484104760336e-05, "loss": 1.8075, "step": 2820 }, { "epoch": 1.3961519486926492, "grad_norm": 10.60366153717041, "learning_rate": 2.6832482292867732e-05, "loss": 1.4447, "step": 2830 }, { "epoch": 1.4010853478046374, "grad_norm": 5.9414286613464355, "learning_rate": 2.6750123538132106e-05, "loss": 1.8544, "step": 2840 }, { "epoch": 1.4060187469166254, "grad_norm": 5.864882946014404, "learning_rate": 2.6667764783396477e-05, "loss": 2.1942, "step": 2850 }, { "epoch": 1.4109521460286136, "grad_norm": 5.399135112762451, "learning_rate": 2.6585406028660848e-05, "loss": 1.9905, "step": 2860 }, { "epoch": 1.4158855451406018, "grad_norm": 4.467281341552734, "learning_rate": 2.650304727392522e-05, "loss": 1.7024, "step": 2870 }, { "epoch": 1.42081894425259, "grad_norm": 7.034926414489746, "learning_rate": 2.642068851918959e-05, "loss": 1.8154, "step": 2880 }, { "epoch": 1.4257523433645782, "grad_norm": 4.346402168273926, "learning_rate": 2.633832976445396e-05, "loss": 1.7937, "step": 2890 }, { "epoch": 1.4306857424765664, "grad_norm": 9.743393898010254, "learning_rate": 2.6255971009718332e-05, "loss": 1.7868, "step": 2900 }, { "epoch": 1.4356191415885546, "grad_norm": 7.675935745239258, "learning_rate": 2.6173612254982703e-05, "loss": 1.8108, "step": 2910 }, { "epoch": 1.4405525407005426, "grad_norm": 5.96721076965332, "learning_rate": 2.6091253500247074e-05, "loss": 1.8259, "step": 2920 }, { "epoch": 1.4454859398125308, "grad_norm": 6.192174911499023, "learning_rate": 2.600889474551145e-05, "loss": 1.7254, "step": 2930 }, { "epoch": 1.450419338924519, "grad_norm": 7.611389636993408, "learning_rate": 2.592653599077582e-05, "loss": 2.1246, "step": 2940 }, { "epoch": 1.4553527380365072, "grad_norm": 8.755757331848145, "learning_rate": 2.584417723604019e-05, "loss": 1.2897, "step": 2950 }, { "epoch": 1.4602861371484952, "grad_norm": 17.625520706176758, "learning_rate": 2.5761818481304562e-05, "loss": 1.8039, "step": 2960 }, { "epoch": 1.4652195362604834, "grad_norm": 6.4828057289123535, "learning_rate": 2.5679459726568933e-05, "loss": 1.7734, "step": 2970 }, { "epoch": 1.4701529353724716, "grad_norm": 4.072720527648926, "learning_rate": 2.5597100971833304e-05, "loss": 2.1754, "step": 2980 }, { "epoch": 1.4750863344844598, "grad_norm": 5.953226089477539, "learning_rate": 2.551474221709768e-05, "loss": 1.7654, "step": 2990 }, { "epoch": 1.480019733596448, "grad_norm": 5.490500450134277, "learning_rate": 2.5432383462362053e-05, "loss": 1.9371, "step": 3000 }, { "epoch": 1.4849531327084362, "grad_norm": 5.547089576721191, "learning_rate": 2.5350024707626424e-05, "loss": 1.8543, "step": 3010 }, { "epoch": 1.4898865318204244, "grad_norm": 7.270637512207031, "learning_rate": 2.5267665952890795e-05, "loss": 1.6728, "step": 3020 }, { "epoch": 1.4948199309324124, "grad_norm": 10.142009735107422, "learning_rate": 2.5185307198155166e-05, "loss": 1.7656, "step": 3030 }, { "epoch": 1.4997533300444006, "grad_norm": 11.43400764465332, "learning_rate": 2.510294844341954e-05, "loss": 1.7731, "step": 3040 }, { "epoch": 1.5046867291563888, "grad_norm": 8.225043296813965, "learning_rate": 2.502058968868391e-05, "loss": 2.0266, "step": 3050 }, { "epoch": 1.5096201282683768, "grad_norm": 8.20263385772705, "learning_rate": 2.4938230933948282e-05, "loss": 1.9749, "step": 3060 }, { "epoch": 1.514553527380365, "grad_norm": 8.855592727661133, "learning_rate": 2.4855872179212653e-05, "loss": 1.9333, "step": 3070 }, { "epoch": 1.5194869264923532, "grad_norm": 5.7763752937316895, "learning_rate": 2.4773513424477024e-05, "loss": 1.5184, "step": 3080 }, { "epoch": 1.5244203256043414, "grad_norm": 8.338578224182129, "learning_rate": 2.4691154669741395e-05, "loss": 1.9215, "step": 3090 }, { "epoch": 1.5293537247163296, "grad_norm": 5.367077350616455, "learning_rate": 2.4608795915005766e-05, "loss": 1.973, "step": 3100 }, { "epoch": 1.5342871238283178, "grad_norm": 9.6682710647583, "learning_rate": 2.4526437160270137e-05, "loss": 1.6508, "step": 3110 }, { "epoch": 1.539220522940306, "grad_norm": 5.364202499389648, "learning_rate": 2.4444078405534508e-05, "loss": 1.9087, "step": 3120 }, { "epoch": 1.5441539220522942, "grad_norm": 9.031702995300293, "learning_rate": 2.4361719650798882e-05, "loss": 1.759, "step": 3130 }, { "epoch": 1.5490873211642822, "grad_norm": 6.895035743713379, "learning_rate": 2.4279360896063253e-05, "loss": 1.7942, "step": 3140 }, { "epoch": 1.5540207202762704, "grad_norm": 7.573614120483398, "learning_rate": 2.4197002141327624e-05, "loss": 2.0408, "step": 3150 }, { "epoch": 1.5589541193882586, "grad_norm": 14.733696937561035, "learning_rate": 2.4114643386591996e-05, "loss": 1.748, "step": 3160 }, { "epoch": 1.5638875185002465, "grad_norm": 7.862844944000244, "learning_rate": 2.4032284631856367e-05, "loss": 1.8507, "step": 3170 }, { "epoch": 1.5688209176122347, "grad_norm": 4.0979390144348145, "learning_rate": 2.3949925877120738e-05, "loss": 2.0245, "step": 3180 }, { "epoch": 1.573754316724223, "grad_norm": 6.260437965393066, "learning_rate": 2.386756712238511e-05, "loss": 1.7502, "step": 3190 }, { "epoch": 1.5786877158362111, "grad_norm": 8.188227653503418, "learning_rate": 2.3785208367649483e-05, "loss": 1.959, "step": 3200 }, { "epoch": 1.5836211149481993, "grad_norm": 5.43758487701416, "learning_rate": 2.3702849612913854e-05, "loss": 1.7763, "step": 3210 }, { "epoch": 1.5885545140601876, "grad_norm": 5.603811740875244, "learning_rate": 2.3620490858178225e-05, "loss": 1.7247, "step": 3220 }, { "epoch": 1.5934879131721758, "grad_norm": 4.974462509155273, "learning_rate": 2.35381321034426e-05, "loss": 1.8819, "step": 3230 }, { "epoch": 1.598421312284164, "grad_norm": 5.835883617401123, "learning_rate": 2.345577334870697e-05, "loss": 1.9156, "step": 3240 }, { "epoch": 1.603354711396152, "grad_norm": 10.42984676361084, "learning_rate": 2.337341459397134e-05, "loss": 1.6488, "step": 3250 }, { "epoch": 1.6082881105081401, "grad_norm": 10.849640846252441, "learning_rate": 2.3291055839235712e-05, "loss": 1.9802, "step": 3260 }, { "epoch": 1.6132215096201281, "grad_norm": 6.4310784339904785, "learning_rate": 2.3208697084500083e-05, "loss": 1.9685, "step": 3270 }, { "epoch": 1.6181549087321163, "grad_norm": 4.533667087554932, "learning_rate": 2.3126338329764454e-05, "loss": 1.5295, "step": 3280 }, { "epoch": 1.6230883078441045, "grad_norm": 7.374139785766602, "learning_rate": 2.3043979575028825e-05, "loss": 1.8251, "step": 3290 }, { "epoch": 1.6280217069560927, "grad_norm": 7.097417831420898, "learning_rate": 2.2961620820293196e-05, "loss": 1.7943, "step": 3300 }, { "epoch": 1.632955106068081, "grad_norm": 9.811117172241211, "learning_rate": 2.2879262065557567e-05, "loss": 1.5591, "step": 3310 }, { "epoch": 1.6378885051800691, "grad_norm": 2.922159194946289, "learning_rate": 2.2796903310821942e-05, "loss": 1.5074, "step": 3320 }, { "epoch": 1.6428219042920573, "grad_norm": 5.961956977844238, "learning_rate": 2.2714544556086313e-05, "loss": 2.1771, "step": 3330 }, { "epoch": 1.6477553034040455, "grad_norm": 6.768845558166504, "learning_rate": 2.2632185801350687e-05, "loss": 1.7893, "step": 3340 }, { "epoch": 1.6526887025160335, "grad_norm": 5.665631294250488, "learning_rate": 2.2549827046615058e-05, "loss": 1.8398, "step": 3350 }, { "epoch": 1.6576221016280217, "grad_norm": 8.640176773071289, "learning_rate": 2.246746829187943e-05, "loss": 1.9559, "step": 3360 }, { "epoch": 1.66255550074001, "grad_norm": 6.773179531097412, "learning_rate": 2.23851095371438e-05, "loss": 1.7461, "step": 3370 }, { "epoch": 1.6674888998519979, "grad_norm": 5.100921630859375, "learning_rate": 2.230275078240817e-05, "loss": 1.9243, "step": 3380 }, { "epoch": 1.672422298963986, "grad_norm": 8.315923690795898, "learning_rate": 2.2220392027672542e-05, "loss": 1.8641, "step": 3390 }, { "epoch": 1.6773556980759743, "grad_norm": 6.231566429138184, "learning_rate": 2.2138033272936913e-05, "loss": 1.9068, "step": 3400 }, { "epoch": 1.6822890971879625, "grad_norm": 4.701402187347412, "learning_rate": 2.2055674518201284e-05, "loss": 1.9061, "step": 3410 }, { "epoch": 1.6872224962999507, "grad_norm": 5.9488325119018555, "learning_rate": 2.197331576346566e-05, "loss": 2.0304, "step": 3420 }, { "epoch": 1.692155895411939, "grad_norm": 9.257286071777344, "learning_rate": 2.189095700873003e-05, "loss": 1.8544, "step": 3430 }, { "epoch": 1.697089294523927, "grad_norm": 6.57626485824585, "learning_rate": 2.18085982539944e-05, "loss": 1.9199, "step": 3440 }, { "epoch": 1.7020226936359153, "grad_norm": 6.381181240081787, "learning_rate": 2.1726239499258772e-05, "loss": 2.0876, "step": 3450 }, { "epoch": 1.7069560927479033, "grad_norm": 3.832770586013794, "learning_rate": 2.1643880744523143e-05, "loss": 1.7948, "step": 3460 }, { "epoch": 1.7118894918598915, "grad_norm": 20.409229278564453, "learning_rate": 2.1561521989787514e-05, "loss": 1.9628, "step": 3470 }, { "epoch": 1.7168228909718797, "grad_norm": 12.321199417114258, "learning_rate": 2.1479163235051885e-05, "loss": 2.149, "step": 3480 }, { "epoch": 1.7217562900838677, "grad_norm": 7.48840856552124, "learning_rate": 2.139680448031626e-05, "loss": 1.6974, "step": 3490 }, { "epoch": 1.7266896891958559, "grad_norm": 10.339478492736816, "learning_rate": 2.131444572558063e-05, "loss": 2.0896, "step": 3500 }, { "epoch": 1.731623088307844, "grad_norm": 8.04315185546875, "learning_rate": 2.1232086970845e-05, "loss": 1.6795, "step": 3510 }, { "epoch": 1.7365564874198323, "grad_norm": 9.33163070678711, "learning_rate": 2.1149728216109376e-05, "loss": 1.7942, "step": 3520 }, { "epoch": 1.7414898865318205, "grad_norm": 11.18399715423584, "learning_rate": 2.1067369461373747e-05, "loss": 1.4243, "step": 3530 }, { "epoch": 1.7464232856438087, "grad_norm": 9.912873268127441, "learning_rate": 2.0985010706638118e-05, "loss": 1.5473, "step": 3540 }, { "epoch": 1.7513566847557969, "grad_norm": 8.763083457946777, "learning_rate": 2.090265195190249e-05, "loss": 1.8, "step": 3550 }, { "epoch": 1.756290083867785, "grad_norm": 8.042789459228516, "learning_rate": 2.082029319716686e-05, "loss": 1.9486, "step": 3560 }, { "epoch": 1.761223482979773, "grad_norm": 5.028835296630859, "learning_rate": 2.073793444243123e-05, "loss": 1.8425, "step": 3570 }, { "epoch": 1.7661568820917612, "grad_norm": 5.263244152069092, "learning_rate": 2.0655575687695602e-05, "loss": 2.3425, "step": 3580 }, { "epoch": 1.7710902812037492, "grad_norm": 6.808823585510254, "learning_rate": 2.0573216932959973e-05, "loss": 1.7344, "step": 3590 }, { "epoch": 1.7760236803157374, "grad_norm": 7.849328994750977, "learning_rate": 2.0490858178224344e-05, "loss": 1.9272, "step": 3600 }, { "epoch": 1.7809570794277256, "grad_norm": 5.156803131103516, "learning_rate": 2.0408499423488718e-05, "loss": 1.9647, "step": 3610 }, { "epoch": 1.7858904785397138, "grad_norm": 8.30761432647705, "learning_rate": 2.032614066875309e-05, "loss": 1.5823, "step": 3620 }, { "epoch": 1.790823877651702, "grad_norm": 9.06478214263916, "learning_rate": 2.0243781914017464e-05, "loss": 2.03, "step": 3630 }, { "epoch": 1.7957572767636902, "grad_norm": 13.680024147033691, "learning_rate": 2.0161423159281835e-05, "loss": 1.9991, "step": 3640 }, { "epoch": 1.8006906758756784, "grad_norm": 6.494818210601807, "learning_rate": 2.0079064404546206e-05, "loss": 1.8554, "step": 3650 }, { "epoch": 1.8056240749876666, "grad_norm": 7.450560092926025, "learning_rate": 1.9996705649810577e-05, "loss": 1.9436, "step": 3660 }, { "epoch": 1.8105574740996546, "grad_norm": 6.066117763519287, "learning_rate": 1.9914346895074948e-05, "loss": 1.7343, "step": 3670 }, { "epoch": 1.8154908732116428, "grad_norm": 4.272149562835693, "learning_rate": 1.983198814033932e-05, "loss": 1.8145, "step": 3680 }, { "epoch": 1.820424272323631, "grad_norm": 7.599696636199951, "learning_rate": 1.974962938560369e-05, "loss": 1.9128, "step": 3690 }, { "epoch": 1.825357671435619, "grad_norm": 8.375687599182129, "learning_rate": 1.966727063086806e-05, "loss": 1.568, "step": 3700 }, { "epoch": 1.8302910705476072, "grad_norm": 12.533012390136719, "learning_rate": 1.9584911876132435e-05, "loss": 1.7363, "step": 3710 }, { "epoch": 1.8352244696595954, "grad_norm": 7.711116313934326, "learning_rate": 1.9502553121396806e-05, "loss": 1.9017, "step": 3720 }, { "epoch": 1.8401578687715836, "grad_norm": 4.572812080383301, "learning_rate": 1.9420194366661177e-05, "loss": 1.5797, "step": 3730 }, { "epoch": 1.8450912678835718, "grad_norm": 4.474880695343018, "learning_rate": 1.9337835611925548e-05, "loss": 2.0278, "step": 3740 }, { "epoch": 1.85002466699556, "grad_norm": 5.497013092041016, "learning_rate": 1.925547685718992e-05, "loss": 1.757, "step": 3750 }, { "epoch": 1.8549580661075482, "grad_norm": 8.392544746398926, "learning_rate": 1.917311810245429e-05, "loss": 1.6158, "step": 3760 }, { "epoch": 1.8598914652195364, "grad_norm": 8.011329650878906, "learning_rate": 1.9090759347718665e-05, "loss": 1.8829, "step": 3770 }, { "epoch": 1.8648248643315244, "grad_norm": 6.119569778442383, "learning_rate": 1.9008400592983036e-05, "loss": 2.5915, "step": 3780 }, { "epoch": 1.8697582634435126, "grad_norm": 4.356236934661865, "learning_rate": 1.8926041838247407e-05, "loss": 1.7601, "step": 3790 }, { "epoch": 1.8746916625555008, "grad_norm": 3.654446601867676, "learning_rate": 1.8843683083511778e-05, "loss": 1.5618, "step": 3800 }, { "epoch": 1.8796250616674888, "grad_norm": 10.40364933013916, "learning_rate": 1.8761324328776152e-05, "loss": 2.1173, "step": 3810 }, { "epoch": 1.884558460779477, "grad_norm": 11.642656326293945, "learning_rate": 1.8678965574040523e-05, "loss": 1.9532, "step": 3820 }, { "epoch": 1.8894918598914652, "grad_norm": 7.247965335845947, "learning_rate": 1.8596606819304894e-05, "loss": 1.8573, "step": 3830 }, { "epoch": 1.8944252590034534, "grad_norm": 8.925628662109375, "learning_rate": 1.8514248064569265e-05, "loss": 1.6098, "step": 3840 }, { "epoch": 1.8993586581154416, "grad_norm": 6.515012264251709, "learning_rate": 1.8431889309833636e-05, "loss": 1.7684, "step": 3850 }, { "epoch": 1.9042920572274298, "grad_norm": 9.089420318603516, "learning_rate": 1.8349530555098007e-05, "loss": 2.1597, "step": 3860 }, { "epoch": 1.909225456339418, "grad_norm": 6.92453670501709, "learning_rate": 1.8267171800362378e-05, "loss": 1.8394, "step": 3870 }, { "epoch": 1.9141588554514062, "grad_norm": 7.409922122955322, "learning_rate": 1.818481304562675e-05, "loss": 1.8576, "step": 3880 }, { "epoch": 1.9190922545633942, "grad_norm": 5.864316463470459, "learning_rate": 1.810245429089112e-05, "loss": 1.7566, "step": 3890 }, { "epoch": 1.9240256536753824, "grad_norm": 7.16778039932251, "learning_rate": 1.8020095536155494e-05, "loss": 1.7765, "step": 3900 }, { "epoch": 1.9289590527873703, "grad_norm": 7.447871208190918, "learning_rate": 1.7937736781419865e-05, "loss": 1.5466, "step": 3910 }, { "epoch": 1.9338924518993585, "grad_norm": 8.456513404846191, "learning_rate": 1.785537802668424e-05, "loss": 1.6053, "step": 3920 }, { "epoch": 1.9388258510113467, "grad_norm": 5.673243045806885, "learning_rate": 1.777301927194861e-05, "loss": 1.6813, "step": 3930 }, { "epoch": 1.943759250123335, "grad_norm": 13.05078125, "learning_rate": 1.7690660517212982e-05, "loss": 1.8932, "step": 3940 }, { "epoch": 1.9486926492353231, "grad_norm": 5.095945358276367, "learning_rate": 1.7608301762477353e-05, "loss": 1.9467, "step": 3950 }, { "epoch": 1.9536260483473114, "grad_norm": 7.766141414642334, "learning_rate": 1.7525943007741724e-05, "loss": 1.9318, "step": 3960 }, { "epoch": 1.9585594474592996, "grad_norm": 6.2664690017700195, "learning_rate": 1.7443584253006095e-05, "loss": 1.5792, "step": 3970 }, { "epoch": 1.9634928465712878, "grad_norm": 9.3766508102417, "learning_rate": 1.7361225498270466e-05, "loss": 1.7359, "step": 3980 }, { "epoch": 1.9684262456832757, "grad_norm": 8.860836029052734, "learning_rate": 1.7278866743534837e-05, "loss": 1.7544, "step": 3990 }, { "epoch": 1.973359644795264, "grad_norm": 6.054210662841797, "learning_rate": 1.719650798879921e-05, "loss": 1.5618, "step": 4000 }, { "epoch": 1.9782930439072521, "grad_norm": 11.120077133178711, "learning_rate": 1.7114149234063582e-05, "loss": 1.8174, "step": 4010 }, { "epoch": 1.9832264430192401, "grad_norm": 6.720516681671143, "learning_rate": 1.7031790479327953e-05, "loss": 1.613, "step": 4020 }, { "epoch": 1.9881598421312283, "grad_norm": 6.063734531402588, "learning_rate": 1.6949431724592324e-05, "loss": 1.9363, "step": 4030 }, { "epoch": 1.9930932412432165, "grad_norm": 5.5154523849487305, "learning_rate": 1.6867072969856695e-05, "loss": 1.6523, "step": 4040 }, { "epoch": 1.9980266403552047, "grad_norm": 10.039024353027344, "learning_rate": 1.6784714215121066e-05, "loss": 1.8938, "step": 4050 } ], "logging_steps": 10, "max_steps": 6081, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5059810462466048e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }