diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,46955 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 5864, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0013655809586805073, + "grad_norm": 3.3650412324693257, + "learning_rate": 0.0, + "loss": 0.5878, + "num_tokens": 781862.0, + "step": 1 + }, + { + "epoch": 0.0027311619173610146, + "grad_norm": 3.3569112201740188, + "learning_rate": 1.1363636363636364e-07, + "loss": 0.6167, + "num_tokens": 1579241.0, + "step": 2 + }, + { + "epoch": 0.004096742876041522, + "grad_norm": 3.41290845097348, + "learning_rate": 2.2727272727272729e-07, + "loss": 0.5823, + "num_tokens": 2313777.0, + "step": 3 + }, + { + "epoch": 0.005462323834722029, + "grad_norm": 3.4842079496751452, + "learning_rate": 3.409090909090909e-07, + "loss": 0.5846, + "num_tokens": 2958332.0, + "step": 4 + }, + { + "epoch": 0.006827904793402537, + "grad_norm": 3.306985245054534, + "learning_rate": 4.5454545454545457e-07, + "loss": 0.5782, + "num_tokens": 3751162.0, + "step": 5 + }, + { + "epoch": 0.008193485752083044, + "grad_norm": 3.2545411749396895, + "learning_rate": 5.681818181818182e-07, + "loss": 0.5903, + "num_tokens": 4522410.0, + "step": 6 + }, + { + "epoch": 0.009559066710763552, + "grad_norm": 3.3932860307909385, + "learning_rate": 6.818181818181818e-07, + "loss": 0.6029, + "num_tokens": 5246142.0, + "step": 7 + }, + { + "epoch": 0.010924647669444058, + "grad_norm": 3.196434844976779, + "learning_rate": 7.954545454545455e-07, + "loss": 0.6017, + "num_tokens": 6047445.0, + "step": 8 + }, + { + "epoch": 0.012290228628124567, + "grad_norm": 3.035147166240426, + "learning_rate": 9.090909090909091e-07, + "loss": 0.5895, + "num_tokens": 6880372.0, + "step": 9 + }, + { + "epoch": 0.013655809586805075, + "grad_norm": 2.843003416139641, + "learning_rate": 1.0227272727272729e-06, + "loss": 0.5795, + "num_tokens": 7653932.0, + "step": 10 + }, + { + "epoch": 0.015021390545485581, + "grad_norm": 2.7943692106453333, + "learning_rate": 1.1363636363636364e-06, + "loss": 0.5849, + "num_tokens": 8367645.0, + "step": 11 + }, + { + "epoch": 0.016386971504166088, + "grad_norm": 2.7811302763362304, + "learning_rate": 1.25e-06, + "loss": 0.5833, + "num_tokens": 9111974.0, + "step": 12 + }, + { + "epoch": 0.017752552462846596, + "grad_norm": 2.5984282822955747, + "learning_rate": 1.3636363636363636e-06, + "loss": 0.5705, + "num_tokens": 9866272.0, + "step": 13 + }, + { + "epoch": 0.019118133421527104, + "grad_norm": 2.5507012084013874, + "learning_rate": 1.4772727272727275e-06, + "loss": 0.569, + "num_tokens": 10557085.0, + "step": 14 + }, + { + "epoch": 0.020483714380207612, + "grad_norm": 2.4506414132525, + "learning_rate": 1.590909090909091e-06, + "loss": 0.5447, + "num_tokens": 11350918.0, + "step": 15 + }, + { + "epoch": 0.021849295338888117, + "grad_norm": 2.5816515429808624, + "learning_rate": 1.7045454545454546e-06, + "loss": 0.5798, + "num_tokens": 11999096.0, + "step": 16 + }, + { + "epoch": 0.023214876297568625, + "grad_norm": 2.2835123474475085, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.5773, + "num_tokens": 12834086.0, + "step": 17 + }, + { + "epoch": 0.024580457256249133, + "grad_norm": 1.8632307461118707, + "learning_rate": 1.931818181818182e-06, + "loss": 0.5476, + "num_tokens": 13567107.0, + "step": 18 + }, + { + "epoch": 0.02594603821492964, + "grad_norm": 1.5552707560509897, + "learning_rate": 2.0454545454545457e-06, + "loss": 0.5386, + "num_tokens": 14306544.0, + "step": 19 + }, + { + "epoch": 0.02731161917361015, + "grad_norm": 1.644543794826326, + "learning_rate": 2.1590909090909092e-06, + "loss": 0.5354, + "num_tokens": 15097194.0, + "step": 20 + }, + { + "epoch": 0.028677200132290654, + "grad_norm": 1.8247718212508763, + "learning_rate": 2.2727272727272728e-06, + "loss": 0.5614, + "num_tokens": 15881981.0, + "step": 21 + }, + { + "epoch": 0.030042781090971162, + "grad_norm": 1.6412491816592796, + "learning_rate": 2.3863636363636367e-06, + "loss": 0.5362, + "num_tokens": 16690050.0, + "step": 22 + }, + { + "epoch": 0.03140836204965167, + "grad_norm": 1.454364307832489, + "learning_rate": 2.5e-06, + "loss": 0.5293, + "num_tokens": 17379495.0, + "step": 23 + }, + { + "epoch": 0.032773943008332175, + "grad_norm": 1.2788991989632146, + "learning_rate": 2.6136363636363637e-06, + "loss": 0.5364, + "num_tokens": 18147974.0, + "step": 24 + }, + { + "epoch": 0.03413952396701268, + "grad_norm": 0.9842957385383875, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.4996, + "num_tokens": 18882258.0, + "step": 25 + }, + { + "epoch": 0.03550510492569319, + "grad_norm": 1.0863291341078234, + "learning_rate": 2.8409090909090916e-06, + "loss": 0.5123, + "num_tokens": 19635638.0, + "step": 26 + }, + { + "epoch": 0.0368706858843737, + "grad_norm": 1.0832881567839654, + "learning_rate": 2.954545454545455e-06, + "loss": 0.4951, + "num_tokens": 20425283.0, + "step": 27 + }, + { + "epoch": 0.03823626684305421, + "grad_norm": 1.0430380267995263, + "learning_rate": 3.0681818181818186e-06, + "loss": 0.4877, + "num_tokens": 21143674.0, + "step": 28 + }, + { + "epoch": 0.039601847801734716, + "grad_norm": 0.9411792061633784, + "learning_rate": 3.181818181818182e-06, + "loss": 0.4962, + "num_tokens": 21875193.0, + "step": 29 + }, + { + "epoch": 0.040967428760415224, + "grad_norm": 0.9226867197014109, + "learning_rate": 3.2954545454545456e-06, + "loss": 0.5071, + "num_tokens": 22612249.0, + "step": 30 + }, + { + "epoch": 0.04233300971909573, + "grad_norm": 0.763361721959445, + "learning_rate": 3.409090909090909e-06, + "loss": 0.4786, + "num_tokens": 23359661.0, + "step": 31 + }, + { + "epoch": 0.04369859067777623, + "grad_norm": 0.8183401383069621, + "learning_rate": 3.522727272727273e-06, + "loss": 0.5099, + "num_tokens": 24201458.0, + "step": 32 + }, + { + "epoch": 0.04506417163645674, + "grad_norm": 0.7674655250701875, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.4873, + "num_tokens": 24973015.0, + "step": 33 + }, + { + "epoch": 0.04642975259513725, + "grad_norm": 0.7844309274110001, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.491, + "num_tokens": 25790103.0, + "step": 34 + }, + { + "epoch": 0.04779533355381776, + "grad_norm": 0.8652269151802866, + "learning_rate": 3.863636363636364e-06, + "loss": 0.4748, + "num_tokens": 26605078.0, + "step": 35 + }, + { + "epoch": 0.049160914512498266, + "grad_norm": 0.8500735677733969, + "learning_rate": 3.9772727272727275e-06, + "loss": 0.4538, + "num_tokens": 27311426.0, + "step": 36 + }, + { + "epoch": 0.050526495471178774, + "grad_norm": 0.7676854129895507, + "learning_rate": 4.0909090909090915e-06, + "loss": 0.456, + "num_tokens": 28028209.0, + "step": 37 + }, + { + "epoch": 0.05189207642985928, + "grad_norm": 0.6904805048255496, + "learning_rate": 4.204545454545455e-06, + "loss": 0.4645, + "num_tokens": 28813214.0, + "step": 38 + }, + { + "epoch": 0.05325765738853979, + "grad_norm": 0.674443339309719, + "learning_rate": 4.3181818181818185e-06, + "loss": 0.4597, + "num_tokens": 29649897.0, + "step": 39 + }, + { + "epoch": 0.0546232383472203, + "grad_norm": 0.6754724402818755, + "learning_rate": 4.4318181818181824e-06, + "loss": 0.4776, + "num_tokens": 30381412.0, + "step": 40 + }, + { + "epoch": 0.0559888193059008, + "grad_norm": 0.6489534214594035, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.4611, + "num_tokens": 31138037.0, + "step": 41 + }, + { + "epoch": 0.05735440026458131, + "grad_norm": 0.5478698172314034, + "learning_rate": 4.6590909090909095e-06, + "loss": 0.4509, + "num_tokens": 31904977.0, + "step": 42 + }, + { + "epoch": 0.058719981223261816, + "grad_norm": 0.4412777204971197, + "learning_rate": 4.772727272727273e-06, + "loss": 0.4757, + "num_tokens": 32705553.0, + "step": 43 + }, + { + "epoch": 0.060085562181942324, + "grad_norm": 0.37314697259810586, + "learning_rate": 4.8863636363636365e-06, + "loss": 0.4645, + "num_tokens": 33448701.0, + "step": 44 + }, + { + "epoch": 0.06145114314062283, + "grad_norm": 0.4397963511117393, + "learning_rate": 5e-06, + "loss": 0.4453, + "num_tokens": 34188110.0, + "step": 45 + }, + { + "epoch": 0.06281672409930333, + "grad_norm": 0.4066256038354766, + "learning_rate": 5.113636363636364e-06, + "loss": 0.4544, + "num_tokens": 34917083.0, + "step": 46 + }, + { + "epoch": 0.06418230505798385, + "grad_norm": 0.38878423635846254, + "learning_rate": 5.2272727272727274e-06, + "loss": 0.4579, + "num_tokens": 35648165.0, + "step": 47 + }, + { + "epoch": 0.06554788601666435, + "grad_norm": 0.3934226660553369, + "learning_rate": 5.340909090909091e-06, + "loss": 0.454, + "num_tokens": 36382540.0, + "step": 48 + }, + { + "epoch": 0.06691346697534487, + "grad_norm": 0.35334436807302505, + "learning_rate": 5.4545454545454545e-06, + "loss": 0.4767, + "num_tokens": 37185624.0, + "step": 49 + }, + { + "epoch": 0.06827904793402537, + "grad_norm": 0.35676201510597416, + "learning_rate": 5.568181818181818e-06, + "loss": 0.4321, + "num_tokens": 37911531.0, + "step": 50 + }, + { + "epoch": 0.06964462889270588, + "grad_norm": 0.424957874194271, + "learning_rate": 5.681818181818183e-06, + "loss": 0.4723, + "num_tokens": 38686844.0, + "step": 51 + }, + { + "epoch": 0.07101020985138638, + "grad_norm": 0.35470760532123313, + "learning_rate": 5.795454545454546e-06, + "loss": 0.4352, + "num_tokens": 39376780.0, + "step": 52 + }, + { + "epoch": 0.0723757908100669, + "grad_norm": 0.34097398547737073, + "learning_rate": 5.90909090909091e-06, + "loss": 0.4567, + "num_tokens": 40177770.0, + "step": 53 + }, + { + "epoch": 0.0737413717687474, + "grad_norm": 0.3656613446075414, + "learning_rate": 6.022727272727273e-06, + "loss": 0.4393, + "num_tokens": 40922233.0, + "step": 54 + }, + { + "epoch": 0.0751069527274279, + "grad_norm": 0.4177074254300217, + "learning_rate": 6.136363636363637e-06, + "loss": 0.461, + "num_tokens": 41776457.0, + "step": 55 + }, + { + "epoch": 0.07647253368610842, + "grad_norm": 0.3837454441911233, + "learning_rate": 6.25e-06, + "loss": 0.441, + "num_tokens": 42492497.0, + "step": 56 + }, + { + "epoch": 0.07783811464478892, + "grad_norm": 0.39451332321583593, + "learning_rate": 6.363636363636364e-06, + "loss": 0.4398, + "num_tokens": 43228499.0, + "step": 57 + }, + { + "epoch": 0.07920369560346943, + "grad_norm": 0.3996252633332505, + "learning_rate": 6.477272727272727e-06, + "loss": 0.4595, + "num_tokens": 43953622.0, + "step": 58 + }, + { + "epoch": 0.08056927656214993, + "grad_norm": 0.38498824581477703, + "learning_rate": 6.590909090909091e-06, + "loss": 0.444, + "num_tokens": 44703923.0, + "step": 59 + }, + { + "epoch": 0.08193485752083045, + "grad_norm": 0.3278378106882008, + "learning_rate": 6.704545454545454e-06, + "loss": 0.4363, + "num_tokens": 45481633.0, + "step": 60 + }, + { + "epoch": 0.08330043847951095, + "grad_norm": 0.3637884232990754, + "learning_rate": 6.818181818181818e-06, + "loss": 0.4466, + "num_tokens": 46179700.0, + "step": 61 + }, + { + "epoch": 0.08466601943819146, + "grad_norm": 0.3309280983512684, + "learning_rate": 6.931818181818183e-06, + "loss": 0.4367, + "num_tokens": 46989012.0, + "step": 62 + }, + { + "epoch": 0.08603160039687197, + "grad_norm": 0.33422459706305213, + "learning_rate": 7.045454545454546e-06, + "loss": 0.4571, + "num_tokens": 47792009.0, + "step": 63 + }, + { + "epoch": 0.08739718135555247, + "grad_norm": 0.3440576954239099, + "learning_rate": 7.15909090909091e-06, + "loss": 0.434, + "num_tokens": 48535837.0, + "step": 64 + }, + { + "epoch": 0.08876276231423298, + "grad_norm": 0.3248374753617506, + "learning_rate": 7.272727272727273e-06, + "loss": 0.4393, + "num_tokens": 49300416.0, + "step": 65 + }, + { + "epoch": 0.09012834327291348, + "grad_norm": 0.32380014393623346, + "learning_rate": 7.386363636363637e-06, + "loss": 0.4405, + "num_tokens": 50027704.0, + "step": 66 + }, + { + "epoch": 0.091493924231594, + "grad_norm": 0.3297272175543842, + "learning_rate": 7.500000000000001e-06, + "loss": 0.4242, + "num_tokens": 50738298.0, + "step": 67 + }, + { + "epoch": 0.0928595051902745, + "grad_norm": 0.36000657652843704, + "learning_rate": 7.613636363636364e-06, + "loss": 0.4433, + "num_tokens": 51518049.0, + "step": 68 + }, + { + "epoch": 0.09422508614895501, + "grad_norm": 0.32613650163068914, + "learning_rate": 7.727272727272727e-06, + "loss": 0.4264, + "num_tokens": 52202527.0, + "step": 69 + }, + { + "epoch": 0.09559066710763552, + "grad_norm": 0.3433725924095902, + "learning_rate": 7.840909090909091e-06, + "loss": 0.4448, + "num_tokens": 52936090.0, + "step": 70 + }, + { + "epoch": 0.09695624806631603, + "grad_norm": 0.3251222538582668, + "learning_rate": 7.954545454545455e-06, + "loss": 0.4232, + "num_tokens": 53700006.0, + "step": 71 + }, + { + "epoch": 0.09832182902499653, + "grad_norm": 0.3658669972345525, + "learning_rate": 8.068181818181819e-06, + "loss": 0.4288, + "num_tokens": 54469611.0, + "step": 72 + }, + { + "epoch": 0.09968740998367703, + "grad_norm": 0.3232970419517625, + "learning_rate": 8.181818181818183e-06, + "loss": 0.4197, + "num_tokens": 55233428.0, + "step": 73 + }, + { + "epoch": 0.10105299094235755, + "grad_norm": 0.3597752744038255, + "learning_rate": 8.295454545454547e-06, + "loss": 0.4548, + "num_tokens": 56002125.0, + "step": 74 + }, + { + "epoch": 0.10241857190103805, + "grad_norm": 0.35190549612706723, + "learning_rate": 8.40909090909091e-06, + "loss": 0.4447, + "num_tokens": 56831886.0, + "step": 75 + }, + { + "epoch": 0.10378415285971856, + "grad_norm": 0.3770860001910229, + "learning_rate": 8.522727272727273e-06, + "loss": 0.4075, + "num_tokens": 57576284.0, + "step": 76 + }, + { + "epoch": 0.10514973381839907, + "grad_norm": 0.3947836413692913, + "learning_rate": 8.636363636363637e-06, + "loss": 0.4224, + "num_tokens": 58350348.0, + "step": 77 + }, + { + "epoch": 0.10651531477707958, + "grad_norm": 0.3697600082612356, + "learning_rate": 8.750000000000001e-06, + "loss": 0.4312, + "num_tokens": 59061243.0, + "step": 78 + }, + { + "epoch": 0.10788089573576008, + "grad_norm": 0.3391868653260943, + "learning_rate": 8.863636363636365e-06, + "loss": 0.4111, + "num_tokens": 59804038.0, + "step": 79 + }, + { + "epoch": 0.1092464766944406, + "grad_norm": 0.37185165832650896, + "learning_rate": 8.977272727272727e-06, + "loss": 0.4361, + "num_tokens": 60563985.0, + "step": 80 + }, + { + "epoch": 0.1106120576531211, + "grad_norm": 0.3328828068083075, + "learning_rate": 9.090909090909091e-06, + "loss": 0.411, + "num_tokens": 61329157.0, + "step": 81 + }, + { + "epoch": 0.1119776386118016, + "grad_norm": 0.3104215777046083, + "learning_rate": 9.204545454545455e-06, + "loss": 0.4083, + "num_tokens": 62190537.0, + "step": 82 + }, + { + "epoch": 0.11334321957048211, + "grad_norm": 0.34714189341720686, + "learning_rate": 9.318181818181819e-06, + "loss": 0.4184, + "num_tokens": 62999763.0, + "step": 83 + }, + { + "epoch": 0.11470880052916262, + "grad_norm": 0.3290620622380869, + "learning_rate": 9.431818181818183e-06, + "loss": 0.4126, + "num_tokens": 63729924.0, + "step": 84 + }, + { + "epoch": 0.11607438148784313, + "grad_norm": 0.3805923225981605, + "learning_rate": 9.545454545454547e-06, + "loss": 0.4216, + "num_tokens": 64474072.0, + "step": 85 + }, + { + "epoch": 0.11743996244652363, + "grad_norm": 0.30722391916270153, + "learning_rate": 9.65909090909091e-06, + "loss": 0.4189, + "num_tokens": 65266387.0, + "step": 86 + }, + { + "epoch": 0.11880554340520415, + "grad_norm": 0.3120134564429968, + "learning_rate": 9.772727272727273e-06, + "loss": 0.4291, + "num_tokens": 66001793.0, + "step": 87 + }, + { + "epoch": 0.12017112436388465, + "grad_norm": 0.32663742684678004, + "learning_rate": 9.886363636363637e-06, + "loss": 0.4225, + "num_tokens": 66749495.0, + "step": 88 + }, + { + "epoch": 0.12153670532256516, + "grad_norm": 0.31168083838187166, + "learning_rate": 1e-05, + "loss": 0.4386, + "num_tokens": 67564234.0, + "step": 89 + }, + { + "epoch": 0.12290228628124567, + "grad_norm": 0.29981938869033276, + "learning_rate": 1.0113636363636365e-05, + "loss": 0.4244, + "num_tokens": 68330993.0, + "step": 90 + }, + { + "epoch": 0.12426786723992617, + "grad_norm": 0.31406004011736194, + "learning_rate": 1.0227272727272729e-05, + "loss": 0.4226, + "num_tokens": 69125869.0, + "step": 91 + }, + { + "epoch": 0.12563344819860667, + "grad_norm": 0.30936526645695506, + "learning_rate": 1.0340909090909093e-05, + "loss": 0.4245, + "num_tokens": 69891218.0, + "step": 92 + }, + { + "epoch": 0.1269990291572872, + "grad_norm": 0.36236097496678255, + "learning_rate": 1.0454545454545455e-05, + "loss": 0.4241, + "num_tokens": 70631922.0, + "step": 93 + }, + { + "epoch": 0.1283646101159677, + "grad_norm": 0.3318515908322732, + "learning_rate": 1.056818181818182e-05, + "loss": 0.4437, + "num_tokens": 71369020.0, + "step": 94 + }, + { + "epoch": 0.1297301910746482, + "grad_norm": 0.3461896686234585, + "learning_rate": 1.0681818181818183e-05, + "loss": 0.4027, + "num_tokens": 72145740.0, + "step": 95 + }, + { + "epoch": 0.1310957720333287, + "grad_norm": 0.3860835709036509, + "learning_rate": 1.0795454545454547e-05, + "loss": 0.3974, + "num_tokens": 72920824.0, + "step": 96 + }, + { + "epoch": 0.13246135299200923, + "grad_norm": 0.3643110918215684, + "learning_rate": 1.0909090909090909e-05, + "loss": 0.4189, + "num_tokens": 73771316.0, + "step": 97 + }, + { + "epoch": 0.13382693395068973, + "grad_norm": 0.3353257666171098, + "learning_rate": 1.1022727272727275e-05, + "loss": 0.404, + "num_tokens": 74515867.0, + "step": 98 + }, + { + "epoch": 0.13519251490937023, + "grad_norm": 0.35881084614551506, + "learning_rate": 1.1136363636363637e-05, + "loss": 0.4258, + "num_tokens": 75280980.0, + "step": 99 + }, + { + "epoch": 0.13655809586805073, + "grad_norm": 0.3290476032876705, + "learning_rate": 1.125e-05, + "loss": 0.4251, + "num_tokens": 76045199.0, + "step": 100 + }, + { + "epoch": 0.13792367682673123, + "grad_norm": 0.35757146172347526, + "learning_rate": 1.1363636363636366e-05, + "loss": 0.4042, + "num_tokens": 76836546.0, + "step": 101 + }, + { + "epoch": 0.13928925778541176, + "grad_norm": 0.3635084532240097, + "learning_rate": 1.1477272727272729e-05, + "loss": 0.4231, + "num_tokens": 77610906.0, + "step": 102 + }, + { + "epoch": 0.14065483874409226, + "grad_norm": 0.306860085328856, + "learning_rate": 1.1590909090909093e-05, + "loss": 0.4162, + "num_tokens": 78465370.0, + "step": 103 + }, + { + "epoch": 0.14202041970277277, + "grad_norm": 0.33073707281264003, + "learning_rate": 1.1704545454545455e-05, + "loss": 0.4122, + "num_tokens": 79261238.0, + "step": 104 + }, + { + "epoch": 0.14338600066145327, + "grad_norm": 0.3099173658517829, + "learning_rate": 1.181818181818182e-05, + "loss": 0.4174, + "num_tokens": 80056201.0, + "step": 105 + }, + { + "epoch": 0.1447515816201338, + "grad_norm": 0.3279392298917046, + "learning_rate": 1.1931818181818183e-05, + "loss": 0.4032, + "num_tokens": 80790155.0, + "step": 106 + }, + { + "epoch": 0.1461171625788143, + "grad_norm": 0.3264212075539135, + "learning_rate": 1.2045454545454547e-05, + "loss": 0.4213, + "num_tokens": 81520104.0, + "step": 107 + }, + { + "epoch": 0.1474827435374948, + "grad_norm": 0.37445493702708793, + "learning_rate": 1.2159090909090909e-05, + "loss": 0.4302, + "num_tokens": 82261359.0, + "step": 108 + }, + { + "epoch": 0.1488483244961753, + "grad_norm": 0.34114998615379194, + "learning_rate": 1.2272727272727274e-05, + "loss": 0.4196, + "num_tokens": 83051862.0, + "step": 109 + }, + { + "epoch": 0.1502139054548558, + "grad_norm": 0.3541077945442501, + "learning_rate": 1.2386363636363637e-05, + "loss": 0.4126, + "num_tokens": 83800921.0, + "step": 110 + }, + { + "epoch": 0.15157948641353633, + "grad_norm": 0.3665298642316797, + "learning_rate": 1.25e-05, + "loss": 0.399, + "num_tokens": 84463933.0, + "step": 111 + }, + { + "epoch": 0.15294506737221683, + "grad_norm": 0.37794199681419804, + "learning_rate": 1.2613636363636366e-05, + "loss": 0.3977, + "num_tokens": 85234195.0, + "step": 112 + }, + { + "epoch": 0.15431064833089733, + "grad_norm": 0.4150871065411515, + "learning_rate": 1.2727272727272728e-05, + "loss": 0.4207, + "num_tokens": 85990575.0, + "step": 113 + }, + { + "epoch": 0.15567622928957783, + "grad_norm": 0.41004065320799266, + "learning_rate": 1.2840909090909092e-05, + "loss": 0.4441, + "num_tokens": 86796478.0, + "step": 114 + }, + { + "epoch": 0.15704181024825836, + "grad_norm": 0.43529548672767665, + "learning_rate": 1.2954545454545455e-05, + "loss": 0.4206, + "num_tokens": 87587191.0, + "step": 115 + }, + { + "epoch": 0.15840739120693886, + "grad_norm": 0.4172472835772534, + "learning_rate": 1.306818181818182e-05, + "loss": 0.4079, + "num_tokens": 88318757.0, + "step": 116 + }, + { + "epoch": 0.15977297216561936, + "grad_norm": 0.38521023276269134, + "learning_rate": 1.3181818181818183e-05, + "loss": 0.4008, + "num_tokens": 89117031.0, + "step": 117 + }, + { + "epoch": 0.16113855312429987, + "grad_norm": 0.45228331579152686, + "learning_rate": 1.3295454545454546e-05, + "loss": 0.4206, + "num_tokens": 89863782.0, + "step": 118 + }, + { + "epoch": 0.16250413408298037, + "grad_norm": 0.3186124048150442, + "learning_rate": 1.3409090909090909e-05, + "loss": 0.3922, + "num_tokens": 90626599.0, + "step": 119 + }, + { + "epoch": 0.1638697150416609, + "grad_norm": 0.341038521251306, + "learning_rate": 1.3522727272727274e-05, + "loss": 0.4171, + "num_tokens": 91426268.0, + "step": 120 + }, + { + "epoch": 0.1652352960003414, + "grad_norm": 0.3119171227854974, + "learning_rate": 1.3636363636363637e-05, + "loss": 0.4053, + "num_tokens": 92168484.0, + "step": 121 + }, + { + "epoch": 0.1666008769590219, + "grad_norm": 0.3215289788156116, + "learning_rate": 1.375e-05, + "loss": 0.4179, + "num_tokens": 93022732.0, + "step": 122 + }, + { + "epoch": 0.1679664579177024, + "grad_norm": 0.3503658529028788, + "learning_rate": 1.3863636363636366e-05, + "loss": 0.3958, + "num_tokens": 93780799.0, + "step": 123 + }, + { + "epoch": 0.16933203887638293, + "grad_norm": 0.36630314169023653, + "learning_rate": 1.3977272727272728e-05, + "loss": 0.4338, + "num_tokens": 94536625.0, + "step": 124 + }, + { + "epoch": 0.17069761983506343, + "grad_norm": 0.37764181000226554, + "learning_rate": 1.4090909090909092e-05, + "loss": 0.4195, + "num_tokens": 95331936.0, + "step": 125 + }, + { + "epoch": 0.17206320079374393, + "grad_norm": 0.40094960882392333, + "learning_rate": 1.4204545454545455e-05, + "loss": 0.4008, + "num_tokens": 96013577.0, + "step": 126 + }, + { + "epoch": 0.17342878175242443, + "grad_norm": 0.4224329999385295, + "learning_rate": 1.431818181818182e-05, + "loss": 0.4352, + "num_tokens": 96777177.0, + "step": 127 + }, + { + "epoch": 0.17479436271110493, + "grad_norm": 0.3659015714456921, + "learning_rate": 1.4431818181818182e-05, + "loss": 0.405, + "num_tokens": 97649984.0, + "step": 128 + }, + { + "epoch": 0.17615994366978546, + "grad_norm": 0.3847777214467326, + "learning_rate": 1.4545454545454546e-05, + "loss": 0.4165, + "num_tokens": 98426131.0, + "step": 129 + }, + { + "epoch": 0.17752552462846596, + "grad_norm": 0.32688609754166364, + "learning_rate": 1.465909090909091e-05, + "loss": 0.3959, + "num_tokens": 99280265.0, + "step": 130 + }, + { + "epoch": 0.17889110558714647, + "grad_norm": 0.44502582613703623, + "learning_rate": 1.4772727272727274e-05, + "loss": 0.4202, + "num_tokens": 100068369.0, + "step": 131 + }, + { + "epoch": 0.18025668654582697, + "grad_norm": 0.3718517597229538, + "learning_rate": 1.4886363636363636e-05, + "loss": 0.4075, + "num_tokens": 100866816.0, + "step": 132 + }, + { + "epoch": 0.1816222675045075, + "grad_norm": 0.4033353268247993, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.4122, + "num_tokens": 101644474.0, + "step": 133 + }, + { + "epoch": 0.182987848463188, + "grad_norm": 0.41126550211732105, + "learning_rate": 1.5113636363636366e-05, + "loss": 0.4096, + "num_tokens": 102393514.0, + "step": 134 + }, + { + "epoch": 0.1843534294218685, + "grad_norm": 0.3712967075606849, + "learning_rate": 1.5227272727272728e-05, + "loss": 0.4032, + "num_tokens": 103111951.0, + "step": 135 + }, + { + "epoch": 0.185719010380549, + "grad_norm": 0.3767775569249854, + "learning_rate": 1.5340909090909094e-05, + "loss": 0.416, + "num_tokens": 103864022.0, + "step": 136 + }, + { + "epoch": 0.1870845913392295, + "grad_norm": 0.375238907768676, + "learning_rate": 1.5454545454545454e-05, + "loss": 0.4221, + "num_tokens": 104563581.0, + "step": 137 + }, + { + "epoch": 0.18845017229791003, + "grad_norm": 0.39224381524332996, + "learning_rate": 1.5568181818181822e-05, + "loss": 0.4163, + "num_tokens": 105282648.0, + "step": 138 + }, + { + "epoch": 0.18981575325659053, + "grad_norm": 0.36362163949154414, + "learning_rate": 1.5681818181818182e-05, + "loss": 0.4107, + "num_tokens": 105977886.0, + "step": 139 + }, + { + "epoch": 0.19118133421527103, + "grad_norm": 0.44442549741258364, + "learning_rate": 1.5795454545454546e-05, + "loss": 0.4309, + "num_tokens": 106783806.0, + "step": 140 + }, + { + "epoch": 0.19254691517395153, + "grad_norm": 0.3348040948116041, + "learning_rate": 1.590909090909091e-05, + "loss": 0.3798, + "num_tokens": 107499922.0, + "step": 141 + }, + { + "epoch": 0.19391249613263206, + "grad_norm": 0.4054092517318534, + "learning_rate": 1.6022727272727274e-05, + "loss": 0.4052, + "num_tokens": 108277422.0, + "step": 142 + }, + { + "epoch": 0.19527807709131256, + "grad_norm": 0.36332864858879105, + "learning_rate": 1.6136363636363638e-05, + "loss": 0.4028, + "num_tokens": 109114032.0, + "step": 143 + }, + { + "epoch": 0.19664365804999306, + "grad_norm": 0.36734020195853057, + "learning_rate": 1.6250000000000002e-05, + "loss": 0.3969, + "num_tokens": 109899718.0, + "step": 144 + }, + { + "epoch": 0.19800923900867357, + "grad_norm": 0.411722441842276, + "learning_rate": 1.6363636363636366e-05, + "loss": 0.4087, + "num_tokens": 110717613.0, + "step": 145 + }, + { + "epoch": 0.19937481996735407, + "grad_norm": 0.3638291017386694, + "learning_rate": 1.647727272727273e-05, + "loss": 0.408, + "num_tokens": 111439912.0, + "step": 146 + }, + { + "epoch": 0.2007404009260346, + "grad_norm": 0.39153546417238255, + "learning_rate": 1.6590909090909094e-05, + "loss": 0.412, + "num_tokens": 112180564.0, + "step": 147 + }, + { + "epoch": 0.2021059818847151, + "grad_norm": 0.3979231399520793, + "learning_rate": 1.6704545454545454e-05, + "loss": 0.4266, + "num_tokens": 112954449.0, + "step": 148 + }, + { + "epoch": 0.2034715628433956, + "grad_norm": 0.35474285289086577, + "learning_rate": 1.681818181818182e-05, + "loss": 0.4032, + "num_tokens": 113701483.0, + "step": 149 + }, + { + "epoch": 0.2048371438020761, + "grad_norm": 0.41960532235754483, + "learning_rate": 1.6931818181818182e-05, + "loss": 0.4121, + "num_tokens": 114385292.0, + "step": 150 + }, + { + "epoch": 0.20620272476075663, + "grad_norm": 0.3208323074924627, + "learning_rate": 1.7045454545454546e-05, + "loss": 0.4124, + "num_tokens": 115144805.0, + "step": 151 + }, + { + "epoch": 0.20756830571943713, + "grad_norm": 0.3918541853413258, + "learning_rate": 1.715909090909091e-05, + "loss": 0.3933, + "num_tokens": 115906063.0, + "step": 152 + }, + { + "epoch": 0.20893388667811763, + "grad_norm": 0.38256450822772764, + "learning_rate": 1.7272727272727274e-05, + "loss": 0.3957, + "num_tokens": 116680256.0, + "step": 153 + }, + { + "epoch": 0.21029946763679813, + "grad_norm": 0.3926981510841826, + "learning_rate": 1.7386363636363638e-05, + "loss": 0.4037, + "num_tokens": 117453273.0, + "step": 154 + }, + { + "epoch": 0.21166504859547863, + "grad_norm": 0.35981922549246803, + "learning_rate": 1.7500000000000002e-05, + "loss": 0.3919, + "num_tokens": 118305135.0, + "step": 155 + }, + { + "epoch": 0.21303062955415916, + "grad_norm": 0.393283477728591, + "learning_rate": 1.7613636363636366e-05, + "loss": 0.417, + "num_tokens": 119052182.0, + "step": 156 + }, + { + "epoch": 0.21439621051283966, + "grad_norm": 0.4037922402082293, + "learning_rate": 1.772727272727273e-05, + "loss": 0.3872, + "num_tokens": 119755320.0, + "step": 157 + }, + { + "epoch": 0.21576179147152016, + "grad_norm": 0.39561764727128945, + "learning_rate": 1.7840909090909094e-05, + "loss": 0.4259, + "num_tokens": 120582682.0, + "step": 158 + }, + { + "epoch": 0.21712737243020067, + "grad_norm": 0.35894057299289617, + "learning_rate": 1.7954545454545454e-05, + "loss": 0.412, + "num_tokens": 121414797.0, + "step": 159 + }, + { + "epoch": 0.2184929533888812, + "grad_norm": 0.39124239469182126, + "learning_rate": 1.806818181818182e-05, + "loss": 0.3916, + "num_tokens": 122173384.0, + "step": 160 + }, + { + "epoch": 0.2198585343475617, + "grad_norm": 0.3770835121096991, + "learning_rate": 1.8181818181818182e-05, + "loss": 0.4095, + "num_tokens": 122918408.0, + "step": 161 + }, + { + "epoch": 0.2212241153062422, + "grad_norm": 0.38063325493115563, + "learning_rate": 1.8295454545454546e-05, + "loss": 0.4097, + "num_tokens": 123669596.0, + "step": 162 + }, + { + "epoch": 0.2225896962649227, + "grad_norm": 0.37518397307666107, + "learning_rate": 1.840909090909091e-05, + "loss": 0.4244, + "num_tokens": 124429571.0, + "step": 163 + }, + { + "epoch": 0.2239552772236032, + "grad_norm": 0.3588695560474301, + "learning_rate": 1.8522727272727274e-05, + "loss": 0.4104, + "num_tokens": 125196963.0, + "step": 164 + }, + { + "epoch": 0.22532085818228373, + "grad_norm": 0.36889996137627357, + "learning_rate": 1.8636363636363638e-05, + "loss": 0.3946, + "num_tokens": 125949052.0, + "step": 165 + }, + { + "epoch": 0.22668643914096423, + "grad_norm": 0.3942017445331087, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.3907, + "num_tokens": 126742770.0, + "step": 166 + }, + { + "epoch": 0.22805202009964473, + "grad_norm": 0.35061375730916733, + "learning_rate": 1.8863636363636366e-05, + "loss": 0.3958, + "num_tokens": 127558819.0, + "step": 167 + }, + { + "epoch": 0.22941760105832523, + "grad_norm": 0.4652802192938087, + "learning_rate": 1.897727272727273e-05, + "loss": 0.4087, + "num_tokens": 128286834.0, + "step": 168 + }, + { + "epoch": 0.23078318201700576, + "grad_norm": 0.4081948054772298, + "learning_rate": 1.9090909090909094e-05, + "loss": 0.4024, + "num_tokens": 129132344.0, + "step": 169 + }, + { + "epoch": 0.23214876297568626, + "grad_norm": 0.4175284220879797, + "learning_rate": 1.9204545454545454e-05, + "loss": 0.4243, + "num_tokens": 129824101.0, + "step": 170 + }, + { + "epoch": 0.23351434393436676, + "grad_norm": 0.45332666037123026, + "learning_rate": 1.931818181818182e-05, + "loss": 0.4015, + "num_tokens": 130556473.0, + "step": 171 + }, + { + "epoch": 0.23487992489304726, + "grad_norm": 0.4152289920667755, + "learning_rate": 1.9431818181818182e-05, + "loss": 0.4122, + "num_tokens": 131291044.0, + "step": 172 + }, + { + "epoch": 0.23624550585172777, + "grad_norm": 0.49231460645116015, + "learning_rate": 1.9545454545454546e-05, + "loss": 0.4377, + "num_tokens": 132084966.0, + "step": 173 + }, + { + "epoch": 0.2376110868104083, + "grad_norm": 0.32369078846343474, + "learning_rate": 1.965909090909091e-05, + "loss": 0.4092, + "num_tokens": 132902353.0, + "step": 174 + }, + { + "epoch": 0.2389766677690888, + "grad_norm": 0.5161325874763318, + "learning_rate": 1.9772727272727274e-05, + "loss": 0.4215, + "num_tokens": 133622462.0, + "step": 175 + }, + { + "epoch": 0.2403422487277693, + "grad_norm": 0.33193016199489667, + "learning_rate": 1.9886363636363638e-05, + "loss": 0.3971, + "num_tokens": 134319877.0, + "step": 176 + }, + { + "epoch": 0.2417078296864498, + "grad_norm": 0.4798974266530614, + "learning_rate": 2e-05, + "loss": 0.396, + "num_tokens": 135041196.0, + "step": 177 + }, + { + "epoch": 0.24307341064513033, + "grad_norm": 0.384269231947528, + "learning_rate": 1.999999862724488e-05, + "loss": 0.3957, + "num_tokens": 135748184.0, + "step": 178 + }, + { + "epoch": 0.24443899160381083, + "grad_norm": 0.4673196178393515, + "learning_rate": 1.9999994508979935e-05, + "loss": 0.394, + "num_tokens": 136581030.0, + "step": 179 + }, + { + "epoch": 0.24580457256249133, + "grad_norm": 0.37523509594136417, + "learning_rate": 1.9999987645206422e-05, + "loss": 0.422, + "num_tokens": 137373989.0, + "step": 180 + }, + { + "epoch": 0.24717015352117183, + "grad_norm": 0.4605013266444706, + "learning_rate": 1.999997803592643e-05, + "loss": 0.3974, + "num_tokens": 138117615.0, + "step": 181 + }, + { + "epoch": 0.24853573447985233, + "grad_norm": 0.35013465535003613, + "learning_rate": 1.9999965681142902e-05, + "loss": 0.395, + "num_tokens": 138849889.0, + "step": 182 + }, + { + "epoch": 0.24990131543853286, + "grad_norm": 0.4634961786265312, + "learning_rate": 1.9999950580859594e-05, + "loss": 0.3974, + "num_tokens": 139648257.0, + "step": 183 + }, + { + "epoch": 0.25126689639721334, + "grad_norm": 0.3564681330312269, + "learning_rate": 1.9999932735081123e-05, + "loss": 0.3939, + "num_tokens": 140300071.0, + "step": 184 + }, + { + "epoch": 0.2526324773558939, + "grad_norm": 0.5212925010332135, + "learning_rate": 1.9999912143812927e-05, + "loss": 0.4056, + "num_tokens": 141035858.0, + "step": 185 + }, + { + "epoch": 0.2539980583145744, + "grad_norm": 0.4182428311993074, + "learning_rate": 1.9999888807061288e-05, + "loss": 0.3914, + "num_tokens": 141908782.0, + "step": 186 + }, + { + "epoch": 0.2553636392732549, + "grad_norm": 0.49147903642173646, + "learning_rate": 1.9999862724833328e-05, + "loss": 0.4409, + "num_tokens": 142717216.0, + "step": 187 + }, + { + "epoch": 0.2567292202319354, + "grad_norm": 0.34481147969504883, + "learning_rate": 1.9999833897137e-05, + "loss": 0.4095, + "num_tokens": 143499035.0, + "step": 188 + }, + { + "epoch": 0.2580948011906159, + "grad_norm": 0.42755706915596964, + "learning_rate": 1.99998023239811e-05, + "loss": 0.3927, + "num_tokens": 144262477.0, + "step": 189 + }, + { + "epoch": 0.2594603821492964, + "grad_norm": 0.4180818729161203, + "learning_rate": 1.999976800537526e-05, + "loss": 0.3963, + "num_tokens": 144972290.0, + "step": 190 + }, + { + "epoch": 0.2608259631079769, + "grad_norm": 0.3959702147826673, + "learning_rate": 1.9999730941329947e-05, + "loss": 0.3945, + "num_tokens": 145734994.0, + "step": 191 + }, + { + "epoch": 0.2621915440666574, + "grad_norm": 0.39347136364939717, + "learning_rate": 1.999969113185647e-05, + "loss": 0.3933, + "num_tokens": 146518634.0, + "step": 192 + }, + { + "epoch": 0.2635571250253379, + "grad_norm": 0.3561674216343085, + "learning_rate": 1.9999648576966974e-05, + "loss": 0.3969, + "num_tokens": 147199487.0, + "step": 193 + }, + { + "epoch": 0.26492270598401846, + "grad_norm": 0.41008506362832686, + "learning_rate": 1.9999603276674434e-05, + "loss": 0.3924, + "num_tokens": 147964517.0, + "step": 194 + }, + { + "epoch": 0.26628828694269896, + "grad_norm": 0.39271386473573855, + "learning_rate": 1.999955523099268e-05, + "loss": 0.4091, + "num_tokens": 148768364.0, + "step": 195 + }, + { + "epoch": 0.26765386790137946, + "grad_norm": 0.4448205584781951, + "learning_rate": 1.9999504439936358e-05, + "loss": 0.4145, + "num_tokens": 149581041.0, + "step": 196 + }, + { + "epoch": 0.26901944886005996, + "grad_norm": 0.3820873204035089, + "learning_rate": 1.999945090352097e-05, + "loss": 0.4024, + "num_tokens": 150376574.0, + "step": 197 + }, + { + "epoch": 0.27038502981874046, + "grad_norm": 0.4655397720104624, + "learning_rate": 1.999939462176284e-05, + "loss": 0.3974, + "num_tokens": 151084782.0, + "step": 198 + }, + { + "epoch": 0.27175061077742096, + "grad_norm": 0.3880409470247186, + "learning_rate": 1.9999335594679144e-05, + "loss": 0.4097, + "num_tokens": 151869753.0, + "step": 199 + }, + { + "epoch": 0.27311619173610147, + "grad_norm": 0.43087037053967353, + "learning_rate": 1.999927382228789e-05, + "loss": 0.4001, + "num_tokens": 152631222.0, + "step": 200 + }, + { + "epoch": 0.27448177269478197, + "grad_norm": 0.38461100559019773, + "learning_rate": 1.9999209304607913e-05, + "loss": 0.3819, + "num_tokens": 153384826.0, + "step": 201 + }, + { + "epoch": 0.27584735365346247, + "grad_norm": 0.38468162299126785, + "learning_rate": 1.9999142041658898e-05, + "loss": 0.3933, + "num_tokens": 154123081.0, + "step": 202 + }, + { + "epoch": 0.277212934612143, + "grad_norm": 0.4111234140427244, + "learning_rate": 1.9999072033461372e-05, + "loss": 0.4122, + "num_tokens": 154974829.0, + "step": 203 + }, + { + "epoch": 0.2785785155708235, + "grad_norm": 0.32232552516136564, + "learning_rate": 1.9998999280036682e-05, + "loss": 0.3984, + "num_tokens": 155689814.0, + "step": 204 + }, + { + "epoch": 0.279944096529504, + "grad_norm": 0.38159043710187907, + "learning_rate": 1.9998923781407025e-05, + "loss": 0.409, + "num_tokens": 156498336.0, + "step": 205 + }, + { + "epoch": 0.28130967748818453, + "grad_norm": 0.3231936218372313, + "learning_rate": 1.9998845537595432e-05, + "loss": 0.409, + "num_tokens": 157211030.0, + "step": 206 + }, + { + "epoch": 0.28267525844686503, + "grad_norm": 0.39701233066766234, + "learning_rate": 1.999876454862577e-05, + "loss": 0.3868, + "num_tokens": 157935970.0, + "step": 207 + }, + { + "epoch": 0.28404083940554553, + "grad_norm": 0.3473286689561777, + "learning_rate": 1.9998680814522756e-05, + "loss": 0.385, + "num_tokens": 158711701.0, + "step": 208 + }, + { + "epoch": 0.28540642036422603, + "grad_norm": 0.39949594661313825, + "learning_rate": 1.999859433531192e-05, + "loss": 0.4084, + "num_tokens": 159464623.0, + "step": 209 + }, + { + "epoch": 0.28677200132290653, + "grad_norm": 0.39437860073351505, + "learning_rate": 1.9998505111019648e-05, + "loss": 0.4153, + "num_tokens": 160241725.0, + "step": 210 + }, + { + "epoch": 0.28813758228158703, + "grad_norm": 0.3562930703448643, + "learning_rate": 1.9998413141673162e-05, + "loss": 0.3916, + "num_tokens": 160978940.0, + "step": 211 + }, + { + "epoch": 0.2895031632402676, + "grad_norm": 0.3659820926431436, + "learning_rate": 1.999831842730051e-05, + "loss": 0.3983, + "num_tokens": 161779321.0, + "step": 212 + }, + { + "epoch": 0.2908687441989481, + "grad_norm": 0.3575333121641269, + "learning_rate": 1.999822096793059e-05, + "loss": 0.4104, + "num_tokens": 162530909.0, + "step": 213 + }, + { + "epoch": 0.2922343251576286, + "grad_norm": 0.33338314476453984, + "learning_rate": 1.9998120763593137e-05, + "loss": 0.419, + "num_tokens": 163320760.0, + "step": 214 + }, + { + "epoch": 0.2935999061163091, + "grad_norm": 0.397913734510872, + "learning_rate": 1.9998017814318712e-05, + "loss": 0.3949, + "num_tokens": 164174798.0, + "step": 215 + }, + { + "epoch": 0.2949654870749896, + "grad_norm": 0.27986609445997696, + "learning_rate": 1.999791212013872e-05, + "loss": 0.3827, + "num_tokens": 164897363.0, + "step": 216 + }, + { + "epoch": 0.2963310680336701, + "grad_norm": 0.3450032237823481, + "learning_rate": 1.9997803681085412e-05, + "loss": 0.3858, + "num_tokens": 165686427.0, + "step": 217 + }, + { + "epoch": 0.2976966489923506, + "grad_norm": 0.32254629341332614, + "learning_rate": 1.999769249719186e-05, + "loss": 0.3877, + "num_tokens": 166498716.0, + "step": 218 + }, + { + "epoch": 0.2990622299510311, + "grad_norm": 0.2791852188392648, + "learning_rate": 1.9997578568491983e-05, + "loss": 0.3764, + "num_tokens": 167265170.0, + "step": 219 + }, + { + "epoch": 0.3004278109097116, + "grad_norm": 0.4004359968618145, + "learning_rate": 1.9997461895020538e-05, + "loss": 0.4077, + "num_tokens": 168029418.0, + "step": 220 + }, + { + "epoch": 0.30179339186839216, + "grad_norm": 0.3049469446124067, + "learning_rate": 1.9997342476813116e-05, + "loss": 0.4008, + "num_tokens": 168831970.0, + "step": 221 + }, + { + "epoch": 0.30315897282707266, + "grad_norm": 0.41221492444530605, + "learning_rate": 1.9997220313906145e-05, + "loss": 0.4171, + "num_tokens": 169643664.0, + "step": 222 + }, + { + "epoch": 0.30452455378575316, + "grad_norm": 0.3516336492924699, + "learning_rate": 1.9997095406336892e-05, + "loss": 0.4036, + "num_tokens": 170389390.0, + "step": 223 + }, + { + "epoch": 0.30589013474443366, + "grad_norm": 0.375968127066635, + "learning_rate": 1.9996967754143467e-05, + "loss": 0.3901, + "num_tokens": 171120690.0, + "step": 224 + }, + { + "epoch": 0.30725571570311416, + "grad_norm": 0.3183575086829415, + "learning_rate": 1.99968373573648e-05, + "loss": 0.3828, + "num_tokens": 171934188.0, + "step": 225 + }, + { + "epoch": 0.30862129666179466, + "grad_norm": 0.29167742320642454, + "learning_rate": 1.9996704216040675e-05, + "loss": 0.3902, + "num_tokens": 172755877.0, + "step": 226 + }, + { + "epoch": 0.30998687762047517, + "grad_norm": 0.3745152571978657, + "learning_rate": 1.999656833021171e-05, + "loss": 0.4057, + "num_tokens": 173513956.0, + "step": 227 + }, + { + "epoch": 0.31135245857915567, + "grad_norm": 0.3155968331719428, + "learning_rate": 1.9996429699919353e-05, + "loss": 0.4003, + "num_tokens": 174298361.0, + "step": 228 + }, + { + "epoch": 0.31271803953783617, + "grad_norm": 0.30620235017275976, + "learning_rate": 1.99962883252059e-05, + "loss": 0.3998, + "num_tokens": 175161082.0, + "step": 229 + }, + { + "epoch": 0.3140836204965167, + "grad_norm": 0.2993754779819447, + "learning_rate": 1.9996144206114477e-05, + "loss": 0.3838, + "num_tokens": 175888547.0, + "step": 230 + }, + { + "epoch": 0.3154492014551972, + "grad_norm": 0.3042805556881561, + "learning_rate": 1.9995997342689044e-05, + "loss": 0.3916, + "num_tokens": 176701182.0, + "step": 231 + }, + { + "epoch": 0.3168147824138777, + "grad_norm": 0.3338530354898371, + "learning_rate": 1.9995847734974403e-05, + "loss": 0.3734, + "num_tokens": 177456235.0, + "step": 232 + }, + { + "epoch": 0.31818036337255823, + "grad_norm": 0.325114394134763, + "learning_rate": 1.99956953830162e-05, + "loss": 0.3873, + "num_tokens": 178193394.0, + "step": 233 + }, + { + "epoch": 0.31954594433123873, + "grad_norm": 0.42273079346990633, + "learning_rate": 1.9995540286860902e-05, + "loss": 0.3915, + "num_tokens": 178913206.0, + "step": 234 + }, + { + "epoch": 0.32091152528991923, + "grad_norm": 0.38867419304556305, + "learning_rate": 1.9995382446555826e-05, + "loss": 0.403, + "num_tokens": 179715440.0, + "step": 235 + }, + { + "epoch": 0.32227710624859973, + "grad_norm": 0.35126682720363056, + "learning_rate": 1.9995221862149126e-05, + "loss": 0.4096, + "num_tokens": 180486851.0, + "step": 236 + }, + { + "epoch": 0.32364268720728023, + "grad_norm": 0.3540006171419841, + "learning_rate": 1.9995058533689784e-05, + "loss": 0.4178, + "num_tokens": 181295035.0, + "step": 237 + }, + { + "epoch": 0.32500826816596073, + "grad_norm": 0.3249230249417242, + "learning_rate": 1.9994892461227626e-05, + "loss": 0.3768, + "num_tokens": 182036842.0, + "step": 238 + }, + { + "epoch": 0.3263738491246413, + "grad_norm": 0.31634248166236556, + "learning_rate": 1.999472364481331e-05, + "loss": 0.4005, + "num_tokens": 182764703.0, + "step": 239 + }, + { + "epoch": 0.3277394300833218, + "grad_norm": 0.316816872669518, + "learning_rate": 1.9994552084498345e-05, + "loss": 0.3954, + "num_tokens": 183540023.0, + "step": 240 + }, + { + "epoch": 0.3291050110420023, + "grad_norm": 0.3804437991638129, + "learning_rate": 1.9994377780335056e-05, + "loss": 0.3962, + "num_tokens": 184271977.0, + "step": 241 + }, + { + "epoch": 0.3304705920006828, + "grad_norm": 0.3163469468649419, + "learning_rate": 1.9994200732376622e-05, + "loss": 0.4046, + "num_tokens": 185074936.0, + "step": 242 + }, + { + "epoch": 0.3318361729593633, + "grad_norm": 0.3880085468452102, + "learning_rate": 1.9994020940677047e-05, + "loss": 0.4155, + "num_tokens": 185912258.0, + "step": 243 + }, + { + "epoch": 0.3332017539180438, + "grad_norm": 0.3540815713190658, + "learning_rate": 1.9993838405291183e-05, + "loss": 0.3935, + "num_tokens": 186708027.0, + "step": 244 + }, + { + "epoch": 0.3345673348767243, + "grad_norm": 0.32729358132477676, + "learning_rate": 1.9993653126274712e-05, + "loss": 0.4094, + "num_tokens": 187484562.0, + "step": 245 + }, + { + "epoch": 0.3359329158354048, + "grad_norm": 0.33996263809705835, + "learning_rate": 1.9993465103684153e-05, + "loss": 0.3937, + "num_tokens": 188214559.0, + "step": 246 + }, + { + "epoch": 0.3372984967940853, + "grad_norm": 0.3072902200874743, + "learning_rate": 1.9993274337576866e-05, + "loss": 0.3921, + "num_tokens": 189003396.0, + "step": 247 + }, + { + "epoch": 0.33866407775276586, + "grad_norm": 0.3046749015339264, + "learning_rate": 1.9993080828011046e-05, + "loss": 0.3799, + "num_tokens": 189712206.0, + "step": 248 + }, + { + "epoch": 0.34002965871144636, + "grad_norm": 0.32137805243013307, + "learning_rate": 1.9992884575045717e-05, + "loss": 0.4022, + "num_tokens": 190557446.0, + "step": 249 + }, + { + "epoch": 0.34139523967012686, + "grad_norm": 0.32728603097870307, + "learning_rate": 1.9992685578740755e-05, + "loss": 0.3932, + "num_tokens": 191296974.0, + "step": 250 + }, + { + "epoch": 0.34276082062880736, + "grad_norm": 0.3099730220704177, + "learning_rate": 1.9992483839156868e-05, + "loss": 0.3867, + "num_tokens": 192090517.0, + "step": 251 + }, + { + "epoch": 0.34412640158748786, + "grad_norm": 0.2842483644080127, + "learning_rate": 1.9992279356355585e-05, + "loss": 0.392, + "num_tokens": 192824923.0, + "step": 252 + }, + { + "epoch": 0.34549198254616836, + "grad_norm": 0.3238047793809841, + "learning_rate": 1.9992072130399297e-05, + "loss": 0.3912, + "num_tokens": 193592604.0, + "step": 253 + }, + { + "epoch": 0.34685756350484886, + "grad_norm": 0.28721131927558613, + "learning_rate": 1.9991862161351218e-05, + "loss": 0.4012, + "num_tokens": 194343790.0, + "step": 254 + }, + { + "epoch": 0.34822314446352937, + "grad_norm": 0.3394688073397443, + "learning_rate": 1.9991649449275397e-05, + "loss": 0.3942, + "num_tokens": 195144936.0, + "step": 255 + }, + { + "epoch": 0.34958872542220987, + "grad_norm": 0.26264904539351003, + "learning_rate": 1.9991433994236723e-05, + "loss": 0.3799, + "num_tokens": 195911042.0, + "step": 256 + }, + { + "epoch": 0.3509543063808904, + "grad_norm": 0.3291469523238052, + "learning_rate": 1.999121579630092e-05, + "loss": 0.3852, + "num_tokens": 196669972.0, + "step": 257 + }, + { + "epoch": 0.3523198873395709, + "grad_norm": 0.3094973846307219, + "learning_rate": 1.999099485553456e-05, + "loss": 0.3987, + "num_tokens": 197435742.0, + "step": 258 + }, + { + "epoch": 0.3536854682982514, + "grad_norm": 0.324981801089362, + "learning_rate": 1.999077117200503e-05, + "loss": 0.3895, + "num_tokens": 198199606.0, + "step": 259 + }, + { + "epoch": 0.3550510492569319, + "grad_norm": 0.31470510510405425, + "learning_rate": 1.999054474578058e-05, + "loss": 0.3751, + "num_tokens": 198987046.0, + "step": 260 + }, + { + "epoch": 0.35641663021561243, + "grad_norm": 0.32930223251356155, + "learning_rate": 1.999031557693027e-05, + "loss": 0.3911, + "num_tokens": 199762693.0, + "step": 261 + }, + { + "epoch": 0.35778221117429293, + "grad_norm": 0.317962820115311, + "learning_rate": 1.9990083665524016e-05, + "loss": 0.4247, + "num_tokens": 200504594.0, + "step": 262 + }, + { + "epoch": 0.35914779213297343, + "grad_norm": 0.32831751675616766, + "learning_rate": 1.998984901163256e-05, + "loss": 0.3966, + "num_tokens": 201206950.0, + "step": 263 + }, + { + "epoch": 0.36051337309165393, + "grad_norm": 0.337537269391906, + "learning_rate": 1.9989611615327497e-05, + "loss": 0.4067, + "num_tokens": 201958474.0, + "step": 264 + }, + { + "epoch": 0.36187895405033443, + "grad_norm": 0.304514178353785, + "learning_rate": 1.998937147668123e-05, + "loss": 0.385, + "num_tokens": 202668583.0, + "step": 265 + }, + { + "epoch": 0.363244535009015, + "grad_norm": 0.27368560229670585, + "learning_rate": 1.9989128595767025e-05, + "loss": 0.3888, + "num_tokens": 203335965.0, + "step": 266 + }, + { + "epoch": 0.3646101159676955, + "grad_norm": 0.32944612825367875, + "learning_rate": 1.9988882972658967e-05, + "loss": 0.3933, + "num_tokens": 204113580.0, + "step": 267 + }, + { + "epoch": 0.365975696926376, + "grad_norm": 0.28819118290481627, + "learning_rate": 1.9988634607431995e-05, + "loss": 0.3946, + "num_tokens": 204930237.0, + "step": 268 + }, + { + "epoch": 0.3673412778850565, + "grad_norm": 0.3140182585732985, + "learning_rate": 1.9988383500161866e-05, + "loss": 0.388, + "num_tokens": 205713593.0, + "step": 269 + }, + { + "epoch": 0.368706858843737, + "grad_norm": 0.28508464158957064, + "learning_rate": 1.9988129650925188e-05, + "loss": 0.3914, + "num_tokens": 206518217.0, + "step": 270 + }, + { + "epoch": 0.3700724398024175, + "grad_norm": 0.30894837924638346, + "learning_rate": 1.9987873059799393e-05, + "loss": 0.3901, + "num_tokens": 207298768.0, + "step": 271 + }, + { + "epoch": 0.371438020761098, + "grad_norm": 0.30752980076574654, + "learning_rate": 1.9987613726862757e-05, + "loss": 0.3909, + "num_tokens": 208115943.0, + "step": 272 + }, + { + "epoch": 0.3728036017197785, + "grad_norm": 0.3078288935321802, + "learning_rate": 1.9987351652194394e-05, + "loss": 0.3857, + "num_tokens": 208902284.0, + "step": 273 + }, + { + "epoch": 0.374169182678459, + "grad_norm": 0.301593513697497, + "learning_rate": 1.9987086835874256e-05, + "loss": 0.3961, + "num_tokens": 209613804.0, + "step": 274 + }, + { + "epoch": 0.37553476363713956, + "grad_norm": 0.280767654185102, + "learning_rate": 1.998681927798312e-05, + "loss": 0.3837, + "num_tokens": 210340571.0, + "step": 275 + }, + { + "epoch": 0.37690034459582006, + "grad_norm": 0.308310030043621, + "learning_rate": 1.9986548978602606e-05, + "loss": 0.3991, + "num_tokens": 211039158.0, + "step": 276 + }, + { + "epoch": 0.37826592555450056, + "grad_norm": 0.2664909217167294, + "learning_rate": 1.9986275937815178e-05, + "loss": 0.3934, + "num_tokens": 211825565.0, + "step": 277 + }, + { + "epoch": 0.37963150651318106, + "grad_norm": 0.2962525295222797, + "learning_rate": 1.998600015570412e-05, + "loss": 0.3918, + "num_tokens": 212582971.0, + "step": 278 + }, + { + "epoch": 0.38099708747186156, + "grad_norm": 0.2552381969232553, + "learning_rate": 1.9985721632353566e-05, + "loss": 0.3844, + "num_tokens": 213266102.0, + "step": 279 + }, + { + "epoch": 0.38236266843054206, + "grad_norm": 0.2883797413981686, + "learning_rate": 1.998544036784848e-05, + "loss": 0.4028, + "num_tokens": 214010120.0, + "step": 280 + }, + { + "epoch": 0.38372824938922256, + "grad_norm": 0.2860005852024724, + "learning_rate": 1.9985156362274663e-05, + "loss": 0.3889, + "num_tokens": 214823555.0, + "step": 281 + }, + { + "epoch": 0.38509383034790307, + "grad_norm": 0.29463479099759743, + "learning_rate": 1.9984869615718757e-05, + "loss": 0.4181, + "num_tokens": 215565525.0, + "step": 282 + }, + { + "epoch": 0.38645941130658357, + "grad_norm": 0.2739628123157226, + "learning_rate": 1.998458012826823e-05, + "loss": 0.3965, + "num_tokens": 216359979.0, + "step": 283 + }, + { + "epoch": 0.3878249922652641, + "grad_norm": 0.29734792567019613, + "learning_rate": 1.9984287900011398e-05, + "loss": 0.3927, + "num_tokens": 217119930.0, + "step": 284 + }, + { + "epoch": 0.3891905732239446, + "grad_norm": 0.27376649183184126, + "learning_rate": 1.9983992931037398e-05, + "loss": 0.3912, + "num_tokens": 217887396.0, + "step": 285 + }, + { + "epoch": 0.3905561541826251, + "grad_norm": 0.3030156285828892, + "learning_rate": 1.998369522143622e-05, + "loss": 0.3999, + "num_tokens": 218687938.0, + "step": 286 + }, + { + "epoch": 0.3919217351413056, + "grad_norm": 0.26357741774904286, + "learning_rate": 1.9983394771298687e-05, + "loss": 0.3819, + "num_tokens": 219500101.0, + "step": 287 + }, + { + "epoch": 0.39328731609998613, + "grad_norm": 0.27773932035001925, + "learning_rate": 1.998309158071644e-05, + "loss": 0.3865, + "num_tokens": 220242171.0, + "step": 288 + }, + { + "epoch": 0.39465289705866663, + "grad_norm": 0.30344000448105085, + "learning_rate": 1.9982785649781983e-05, + "loss": 0.396, + "num_tokens": 221055384.0, + "step": 289 + }, + { + "epoch": 0.39601847801734713, + "grad_norm": 0.2726114378102743, + "learning_rate": 1.998247697858863e-05, + "loss": 0.406, + "num_tokens": 221792382.0, + "step": 290 + }, + { + "epoch": 0.39738405897602763, + "grad_norm": 0.5283030074048782, + "learning_rate": 1.9982165567230552e-05, + "loss": 0.401, + "num_tokens": 222572810.0, + "step": 291 + }, + { + "epoch": 0.39874963993470813, + "grad_norm": 0.30188348683730365, + "learning_rate": 1.9981851415802743e-05, + "loss": 0.3855, + "num_tokens": 223262344.0, + "step": 292 + }, + { + "epoch": 0.4001152208933887, + "grad_norm": 0.31300225912713664, + "learning_rate": 1.998153452440104e-05, + "loss": 0.3905, + "num_tokens": 224015030.0, + "step": 293 + }, + { + "epoch": 0.4014808018520692, + "grad_norm": 0.2798364017827532, + "learning_rate": 1.9981214893122107e-05, + "loss": 0.3852, + "num_tokens": 224756295.0, + "step": 294 + }, + { + "epoch": 0.4028463828107497, + "grad_norm": 0.31661515877008606, + "learning_rate": 1.9980892522063457e-05, + "loss": 0.3884, + "num_tokens": 225558938.0, + "step": 295 + }, + { + "epoch": 0.4042119637694302, + "grad_norm": 0.2831359311396393, + "learning_rate": 1.9980567411323427e-05, + "loss": 0.3707, + "num_tokens": 226245787.0, + "step": 296 + }, + { + "epoch": 0.4055775447281107, + "grad_norm": 0.272849610936483, + "learning_rate": 1.9980239561001192e-05, + "loss": 0.4126, + "num_tokens": 227039605.0, + "step": 297 + }, + { + "epoch": 0.4069431256867912, + "grad_norm": 0.2902834845698562, + "learning_rate": 1.997990897119677e-05, + "loss": 0.3811, + "num_tokens": 227815552.0, + "step": 298 + }, + { + "epoch": 0.4083087066454717, + "grad_norm": 0.26960678467819793, + "learning_rate": 1.997957564201101e-05, + "loss": 0.3931, + "num_tokens": 228659573.0, + "step": 299 + }, + { + "epoch": 0.4096742876041522, + "grad_norm": 0.25950510396300264, + "learning_rate": 1.997923957354559e-05, + "loss": 0.3887, + "num_tokens": 229433563.0, + "step": 300 + }, + { + "epoch": 0.4110398685628327, + "grad_norm": 0.29351318438149404, + "learning_rate": 1.9978900765903037e-05, + "loss": 0.3802, + "num_tokens": 230233993.0, + "step": 301 + }, + { + "epoch": 0.41240544952151326, + "grad_norm": 0.2787262385318486, + "learning_rate": 1.9978559219186702e-05, + "loss": 0.4058, + "num_tokens": 230960817.0, + "step": 302 + }, + { + "epoch": 0.41377103048019376, + "grad_norm": 0.2650837790346735, + "learning_rate": 1.9978214933500777e-05, + "loss": 0.3963, + "num_tokens": 231745708.0, + "step": 303 + }, + { + "epoch": 0.41513661143887426, + "grad_norm": 0.27633166340896387, + "learning_rate": 1.997786790895029e-05, + "loss": 0.4047, + "num_tokens": 232542634.0, + "step": 304 + }, + { + "epoch": 0.41650219239755476, + "grad_norm": 0.2820431872850607, + "learning_rate": 1.99775181456411e-05, + "loss": 0.4083, + "num_tokens": 233333236.0, + "step": 305 + }, + { + "epoch": 0.41786777335623526, + "grad_norm": 0.26189315975193533, + "learning_rate": 1.9977165643679913e-05, + "loss": 0.3849, + "num_tokens": 234089042.0, + "step": 306 + }, + { + "epoch": 0.41923335431491576, + "grad_norm": 0.28671882177057284, + "learning_rate": 1.997681040317425e-05, + "loss": 0.4107, + "num_tokens": 234830707.0, + "step": 307 + }, + { + "epoch": 0.42059893527359626, + "grad_norm": 0.27643270280565047, + "learning_rate": 1.997645242423249e-05, + "loss": 0.3835, + "num_tokens": 235545529.0, + "step": 308 + }, + { + "epoch": 0.42196451623227677, + "grad_norm": 0.2791557944426623, + "learning_rate": 1.997609170696383e-05, + "loss": 0.3986, + "num_tokens": 236370933.0, + "step": 309 + }, + { + "epoch": 0.42333009719095727, + "grad_norm": 0.3031548819088689, + "learning_rate": 1.997572825147831e-05, + "loss": 0.4048, + "num_tokens": 237102492.0, + "step": 310 + }, + { + "epoch": 0.4246956781496378, + "grad_norm": 0.2662342314014747, + "learning_rate": 1.9975362057886812e-05, + "loss": 0.3834, + "num_tokens": 237880756.0, + "step": 311 + }, + { + "epoch": 0.4260612591083183, + "grad_norm": 0.27593203948273765, + "learning_rate": 1.9974993126301037e-05, + "loss": 0.4045, + "num_tokens": 238626595.0, + "step": 312 + }, + { + "epoch": 0.4274268400669988, + "grad_norm": 0.3344909718483932, + "learning_rate": 1.9974621456833533e-05, + "loss": 0.3893, + "num_tokens": 239367594.0, + "step": 313 + }, + { + "epoch": 0.4287924210256793, + "grad_norm": 0.27152942672280966, + "learning_rate": 1.997424704959768e-05, + "loss": 0.3942, + "num_tokens": 240116791.0, + "step": 314 + }, + { + "epoch": 0.43015800198435983, + "grad_norm": 0.35187788812986587, + "learning_rate": 1.9973869904707692e-05, + "loss": 0.4018, + "num_tokens": 240817505.0, + "step": 315 + }, + { + "epoch": 0.43152358294304033, + "grad_norm": 0.29379276078832656, + "learning_rate": 1.9973490022278624e-05, + "loss": 0.4099, + "num_tokens": 241564572.0, + "step": 316 + }, + { + "epoch": 0.43288916390172083, + "grad_norm": 0.3272274069124779, + "learning_rate": 1.9973107402426356e-05, + "loss": 0.4107, + "num_tokens": 242315939.0, + "step": 317 + }, + { + "epoch": 0.43425474486040133, + "grad_norm": 0.2799499428114526, + "learning_rate": 1.9972722045267615e-05, + "loss": 0.3942, + "num_tokens": 243121592.0, + "step": 318 + }, + { + "epoch": 0.43562032581908183, + "grad_norm": 0.3132778483317372, + "learning_rate": 1.997233395091995e-05, + "loss": 0.3893, + "num_tokens": 243893474.0, + "step": 319 + }, + { + "epoch": 0.4369859067777624, + "grad_norm": 0.2736232558207145, + "learning_rate": 1.997194311950176e-05, + "loss": 0.3676, + "num_tokens": 244667700.0, + "step": 320 + }, + { + "epoch": 0.4383514877364429, + "grad_norm": 0.31393048004474117, + "learning_rate": 1.9971549551132264e-05, + "loss": 0.4038, + "num_tokens": 245524894.0, + "step": 321 + }, + { + "epoch": 0.4397170686951234, + "grad_norm": 0.27953373187517455, + "learning_rate": 1.9971153245931525e-05, + "loss": 0.4187, + "num_tokens": 246318623.0, + "step": 322 + }, + { + "epoch": 0.4410826496538039, + "grad_norm": 0.3208878683747047, + "learning_rate": 1.9970754204020438e-05, + "loss": 0.3979, + "num_tokens": 247063112.0, + "step": 323 + }, + { + "epoch": 0.4424482306124844, + "grad_norm": 0.28182071526957775, + "learning_rate": 1.9970352425520733e-05, + "loss": 0.3803, + "num_tokens": 247731589.0, + "step": 324 + }, + { + "epoch": 0.4438138115711649, + "grad_norm": 0.31897862190934717, + "learning_rate": 1.9969947910554976e-05, + "loss": 0.3944, + "num_tokens": 248444602.0, + "step": 325 + }, + { + "epoch": 0.4451793925298454, + "grad_norm": 0.28908743458874975, + "learning_rate": 1.9969540659246568e-05, + "loss": 0.393, + "num_tokens": 249230640.0, + "step": 326 + }, + { + "epoch": 0.4465449734885259, + "grad_norm": 0.2760586145185695, + "learning_rate": 1.9969130671719743e-05, + "loss": 0.3989, + "num_tokens": 249979267.0, + "step": 327 + }, + { + "epoch": 0.4479105544472064, + "grad_norm": 0.30622359342047717, + "learning_rate": 1.9968717948099567e-05, + "loss": 0.3879, + "num_tokens": 250713432.0, + "step": 328 + }, + { + "epoch": 0.44927613540588696, + "grad_norm": 0.2891241067300086, + "learning_rate": 1.996830248851195e-05, + "loss": 0.4007, + "num_tokens": 251499076.0, + "step": 329 + }, + { + "epoch": 0.45064171636456746, + "grad_norm": 0.25680471264854765, + "learning_rate": 1.9967884293083627e-05, + "loss": 0.3704, + "num_tokens": 252294504.0, + "step": 330 + }, + { + "epoch": 0.45200729732324796, + "grad_norm": 0.27342123129651363, + "learning_rate": 1.9967463361942173e-05, + "loss": 0.4062, + "num_tokens": 253098070.0, + "step": 331 + }, + { + "epoch": 0.45337287828192846, + "grad_norm": 0.2638329152757139, + "learning_rate": 1.9967039695215994e-05, + "loss": 0.3846, + "num_tokens": 253773906.0, + "step": 332 + }, + { + "epoch": 0.45473845924060896, + "grad_norm": 0.2657793566799468, + "learning_rate": 1.9966613293034333e-05, + "loss": 0.3918, + "num_tokens": 254497202.0, + "step": 333 + }, + { + "epoch": 0.45610404019928946, + "grad_norm": 0.28704836815028045, + "learning_rate": 1.9966184155527272e-05, + "loss": 0.3872, + "num_tokens": 255311898.0, + "step": 334 + }, + { + "epoch": 0.45746962115796996, + "grad_norm": 0.2513708559824376, + "learning_rate": 1.9965752282825714e-05, + "loss": 0.4045, + "num_tokens": 256101427.0, + "step": 335 + }, + { + "epoch": 0.45883520211665046, + "grad_norm": 0.29544118773578576, + "learning_rate": 1.9965317675061403e-05, + "loss": 0.3803, + "num_tokens": 256854702.0, + "step": 336 + }, + { + "epoch": 0.46020078307533097, + "grad_norm": 0.30074858818631434, + "learning_rate": 1.996488033236693e-05, + "loss": 0.3926, + "num_tokens": 257665665.0, + "step": 337 + }, + { + "epoch": 0.4615663640340115, + "grad_norm": 0.2774812609610397, + "learning_rate": 1.9964440254875705e-05, + "loss": 0.3756, + "num_tokens": 258421412.0, + "step": 338 + }, + { + "epoch": 0.462931944992692, + "grad_norm": 0.36005669021995146, + "learning_rate": 1.9963997442721972e-05, + "loss": 0.393, + "num_tokens": 259281491.0, + "step": 339 + }, + { + "epoch": 0.4642975259513725, + "grad_norm": 0.32279468577531245, + "learning_rate": 1.996355189604082e-05, + "loss": 0.4021, + "num_tokens": 260132318.0, + "step": 340 + }, + { + "epoch": 0.465663106910053, + "grad_norm": 0.27780022276604843, + "learning_rate": 1.996310361496816e-05, + "loss": 0.3821, + "num_tokens": 260859367.0, + "step": 341 + }, + { + "epoch": 0.4670286878687335, + "grad_norm": 0.30291829349542976, + "learning_rate": 1.996265259964075e-05, + "loss": 0.3815, + "num_tokens": 261592897.0, + "step": 342 + }, + { + "epoch": 0.46839426882741403, + "grad_norm": 0.2876814464513085, + "learning_rate": 1.9962198850196166e-05, + "loss": 0.3783, + "num_tokens": 262343138.0, + "step": 343 + }, + { + "epoch": 0.46975984978609453, + "grad_norm": 0.32155905506701327, + "learning_rate": 1.9961742366772832e-05, + "loss": 0.3776, + "num_tokens": 263134383.0, + "step": 344 + }, + { + "epoch": 0.47112543074477503, + "grad_norm": 0.2954037842357005, + "learning_rate": 1.9961283149510007e-05, + "loss": 0.4046, + "num_tokens": 263964936.0, + "step": 345 + }, + { + "epoch": 0.47249101170345553, + "grad_norm": 0.2557493428932082, + "learning_rate": 1.996082119854777e-05, + "loss": 0.3872, + "num_tokens": 264686760.0, + "step": 346 + }, + { + "epoch": 0.4738565926621361, + "grad_norm": 0.3154717167445236, + "learning_rate": 1.996035651402705e-05, + "loss": 0.3961, + "num_tokens": 265443028.0, + "step": 347 + }, + { + "epoch": 0.4752221736208166, + "grad_norm": 0.29346388965250597, + "learning_rate": 1.9959889096089594e-05, + "loss": 0.4049, + "num_tokens": 266231883.0, + "step": 348 + }, + { + "epoch": 0.4765877545794971, + "grad_norm": 0.2795639359573214, + "learning_rate": 1.9959418944877992e-05, + "loss": 0.3776, + "num_tokens": 267006260.0, + "step": 349 + }, + { + "epoch": 0.4779533355381776, + "grad_norm": 0.28019482619663694, + "learning_rate": 1.9958946060535675e-05, + "loss": 0.3934, + "num_tokens": 267769431.0, + "step": 350 + }, + { + "epoch": 0.4793189164968581, + "grad_norm": 0.3286925047098573, + "learning_rate": 1.9958470443206886e-05, + "loss": 0.4057, + "num_tokens": 268531278.0, + "step": 351 + }, + { + "epoch": 0.4806844974555386, + "grad_norm": 0.271067937301479, + "learning_rate": 1.9957992093036733e-05, + "loss": 0.3981, + "num_tokens": 269329197.0, + "step": 352 + }, + { + "epoch": 0.4820500784142191, + "grad_norm": 0.2793519046266766, + "learning_rate": 1.9957511010171124e-05, + "loss": 0.3948, + "num_tokens": 270106226.0, + "step": 353 + }, + { + "epoch": 0.4834156593728996, + "grad_norm": 0.3101396768130941, + "learning_rate": 1.9957027194756825e-05, + "loss": 0.4024, + "num_tokens": 270869973.0, + "step": 354 + }, + { + "epoch": 0.4847812403315801, + "grad_norm": 0.24183674814782896, + "learning_rate": 1.995654064694142e-05, + "loss": 0.3987, + "num_tokens": 271641818.0, + "step": 355 + }, + { + "epoch": 0.48614682129026066, + "grad_norm": 0.336563986256493, + "learning_rate": 1.9956051366873344e-05, + "loss": 0.4046, + "num_tokens": 272466578.0, + "step": 356 + }, + { + "epoch": 0.48751240224894116, + "grad_norm": 0.2850997465618145, + "learning_rate": 1.9955559354701847e-05, + "loss": 0.401, + "num_tokens": 273263173.0, + "step": 357 + }, + { + "epoch": 0.48887798320762166, + "grad_norm": 0.341007592208121, + "learning_rate": 1.9955064610577025e-05, + "loss": 0.3991, + "num_tokens": 274072129.0, + "step": 358 + }, + { + "epoch": 0.49024356416630216, + "grad_norm": 0.3034384213011678, + "learning_rate": 1.9954567134649802e-05, + "loss": 0.3929, + "num_tokens": 274788487.0, + "step": 359 + }, + { + "epoch": 0.49160914512498266, + "grad_norm": 0.34700266089629217, + "learning_rate": 1.9954066927071933e-05, + "loss": 0.4025, + "num_tokens": 275517372.0, + "step": 360 + }, + { + "epoch": 0.49297472608366316, + "grad_norm": 0.24559917447059515, + "learning_rate": 1.995356398799601e-05, + "loss": 0.3756, + "num_tokens": 276221573.0, + "step": 361 + }, + { + "epoch": 0.49434030704234366, + "grad_norm": 0.29820127258741186, + "learning_rate": 1.9953058317575466e-05, + "loss": 0.3716, + "num_tokens": 276933000.0, + "step": 362 + }, + { + "epoch": 0.49570588800102416, + "grad_norm": 0.29464656103684145, + "learning_rate": 1.995254991596455e-05, + "loss": 0.3908, + "num_tokens": 277707476.0, + "step": 363 + }, + { + "epoch": 0.49707146895970467, + "grad_norm": 0.3108825389264355, + "learning_rate": 1.9952038783318355e-05, + "loss": 0.3873, + "num_tokens": 278432728.0, + "step": 364 + }, + { + "epoch": 0.4984370499183852, + "grad_norm": 0.2695543241455866, + "learning_rate": 1.995152491979281e-05, + "loss": 0.3964, + "num_tokens": 279161715.0, + "step": 365 + }, + { + "epoch": 0.4998026308770657, + "grad_norm": 0.26050177831941734, + "learning_rate": 1.995100832554467e-05, + "loss": 0.3848, + "num_tokens": 279998957.0, + "step": 366 + }, + { + "epoch": 0.5011682118357462, + "grad_norm": 0.26686976133695306, + "learning_rate": 1.9950489000731523e-05, + "loss": 0.3806, + "num_tokens": 280724323.0, + "step": 367 + }, + { + "epoch": 0.5025337927944267, + "grad_norm": 0.2784986859560697, + "learning_rate": 1.9949966945511797e-05, + "loss": 0.3835, + "num_tokens": 281517033.0, + "step": 368 + }, + { + "epoch": 0.5038993737531072, + "grad_norm": 0.2758146927110324, + "learning_rate": 1.994944216004474e-05, + "loss": 0.4011, + "num_tokens": 282370345.0, + "step": 369 + }, + { + "epoch": 0.5052649547117878, + "grad_norm": 0.2698387260904606, + "learning_rate": 1.9948914644490456e-05, + "loss": 0.3784, + "num_tokens": 283115713.0, + "step": 370 + }, + { + "epoch": 0.5066305356704682, + "grad_norm": 0.27838457135525596, + "learning_rate": 1.9948384399009852e-05, + "loss": 0.3679, + "num_tokens": 283864860.0, + "step": 371 + }, + { + "epoch": 0.5079961166291488, + "grad_norm": 0.2727942574219679, + "learning_rate": 1.9947851423764693e-05, + "loss": 0.3941, + "num_tokens": 284665733.0, + "step": 372 + }, + { + "epoch": 0.5093616975878292, + "grad_norm": 0.2573338521711438, + "learning_rate": 1.994731571891756e-05, + "loss": 0.4076, + "num_tokens": 285469958.0, + "step": 373 + }, + { + "epoch": 0.5107272785465098, + "grad_norm": 0.2863594498607459, + "learning_rate": 1.9946777284631877e-05, + "loss": 0.3828, + "num_tokens": 286222903.0, + "step": 374 + }, + { + "epoch": 0.5120928595051902, + "grad_norm": 0.2989772798339145, + "learning_rate": 1.99462361210719e-05, + "loss": 0.4123, + "num_tokens": 287019550.0, + "step": 375 + }, + { + "epoch": 0.5134584404638708, + "grad_norm": 0.24819277826666683, + "learning_rate": 1.9945692228402708e-05, + "loss": 0.3713, + "num_tokens": 287752277.0, + "step": 376 + }, + { + "epoch": 0.5148240214225512, + "grad_norm": 0.3151599320480343, + "learning_rate": 1.9945145606790222e-05, + "loss": 0.3725, + "num_tokens": 288496766.0, + "step": 377 + }, + { + "epoch": 0.5161896023812318, + "grad_norm": 0.27129002688225495, + "learning_rate": 1.9944596256401198e-05, + "loss": 0.3949, + "num_tokens": 289277216.0, + "step": 378 + }, + { + "epoch": 0.5175551833399124, + "grad_norm": 0.37934420516247214, + "learning_rate": 1.9944044177403205e-05, + "loss": 0.387, + "num_tokens": 290000884.0, + "step": 379 + }, + { + "epoch": 0.5189207642985928, + "grad_norm": 0.3090685932126554, + "learning_rate": 1.9943489369964672e-05, + "loss": 0.3952, + "num_tokens": 290694042.0, + "step": 380 + }, + { + "epoch": 0.5202863452572734, + "grad_norm": 0.28055184622500373, + "learning_rate": 1.994293183425484e-05, + "loss": 0.4098, + "num_tokens": 291467310.0, + "step": 381 + }, + { + "epoch": 0.5216519262159538, + "grad_norm": 0.2558143703975956, + "learning_rate": 1.9942371570443793e-05, + "loss": 0.3811, + "num_tokens": 292263453.0, + "step": 382 + }, + { + "epoch": 0.5230175071746344, + "grad_norm": 0.28700899851986844, + "learning_rate": 1.9941808578702438e-05, + "loss": 0.3936, + "num_tokens": 293045342.0, + "step": 383 + }, + { + "epoch": 0.5243830881333148, + "grad_norm": 0.2701965433820782, + "learning_rate": 1.9941242859202527e-05, + "loss": 0.3855, + "num_tokens": 293752202.0, + "step": 384 + }, + { + "epoch": 0.5257486690919954, + "grad_norm": 0.2539671988876104, + "learning_rate": 1.994067441211663e-05, + "loss": 0.3772, + "num_tokens": 294509326.0, + "step": 385 + }, + { + "epoch": 0.5271142500506758, + "grad_norm": 0.2722733623938688, + "learning_rate": 1.9940103237618153e-05, + "loss": 0.3986, + "num_tokens": 295286096.0, + "step": 386 + }, + { + "epoch": 0.5284798310093564, + "grad_norm": 0.2696900803864628, + "learning_rate": 1.9939529335881348e-05, + "loss": 0.3779, + "num_tokens": 296024065.0, + "step": 387 + }, + { + "epoch": 0.5298454119680369, + "grad_norm": 0.24862256946366604, + "learning_rate": 1.9938952707081277e-05, + "loss": 0.3992, + "num_tokens": 296814855.0, + "step": 388 + }, + { + "epoch": 0.5312109929267174, + "grad_norm": 0.23943991538031187, + "learning_rate": 1.9938373351393846e-05, + "loss": 0.3865, + "num_tokens": 297601966.0, + "step": 389 + }, + { + "epoch": 0.5325765738853979, + "grad_norm": 0.27228591211162795, + "learning_rate": 1.99377912689958e-05, + "loss": 0.393, + "num_tokens": 298298993.0, + "step": 390 + }, + { + "epoch": 0.5339421548440784, + "grad_norm": 0.23819815267255628, + "learning_rate": 1.9937206460064698e-05, + "loss": 0.3922, + "num_tokens": 299084362.0, + "step": 391 + }, + { + "epoch": 0.5353077358027589, + "grad_norm": 0.2510574264021747, + "learning_rate": 1.993661892477894e-05, + "loss": 0.3864, + "num_tokens": 299866430.0, + "step": 392 + }, + { + "epoch": 0.5366733167614394, + "grad_norm": 0.2523478907599862, + "learning_rate": 1.9936028663317762e-05, + "loss": 0.3831, + "num_tokens": 300572297.0, + "step": 393 + }, + { + "epoch": 0.5380388977201199, + "grad_norm": 0.272754024461173, + "learning_rate": 1.9935435675861227e-05, + "loss": 0.3742, + "num_tokens": 301280163.0, + "step": 394 + }, + { + "epoch": 0.5394044786788004, + "grad_norm": 0.27892282948839753, + "learning_rate": 1.9934839962590224e-05, + "loss": 0.3674, + "num_tokens": 302007061.0, + "step": 395 + }, + { + "epoch": 0.5407700596374809, + "grad_norm": 0.2767111228811789, + "learning_rate": 1.9934241523686487e-05, + "loss": 0.3911, + "num_tokens": 302793860.0, + "step": 396 + }, + { + "epoch": 0.5421356405961615, + "grad_norm": 0.27940877557372473, + "learning_rate": 1.993364035933257e-05, + "loss": 0.4072, + "num_tokens": 303539228.0, + "step": 397 + }, + { + "epoch": 0.5435012215548419, + "grad_norm": 0.2692479564607352, + "learning_rate": 1.993303646971186e-05, + "loss": 0.3853, + "num_tokens": 304271393.0, + "step": 398 + }, + { + "epoch": 0.5448668025135225, + "grad_norm": 0.25916129072101557, + "learning_rate": 1.993242985500858e-05, + "loss": 0.3981, + "num_tokens": 305007034.0, + "step": 399 + }, + { + "epoch": 0.5462323834722029, + "grad_norm": 0.3091265460204507, + "learning_rate": 1.9931820515407784e-05, + "loss": 0.388, + "num_tokens": 305738401.0, + "step": 400 + }, + { + "epoch": 0.5475979644308835, + "grad_norm": 0.27150239937933734, + "learning_rate": 1.9931208451095353e-05, + "loss": 0.3994, + "num_tokens": 306532200.0, + "step": 401 + }, + { + "epoch": 0.5489635453895639, + "grad_norm": 0.3194026900410218, + "learning_rate": 1.9930593662258e-05, + "loss": 0.407, + "num_tokens": 307244307.0, + "step": 402 + }, + { + "epoch": 0.5503291263482445, + "grad_norm": 0.2650284159872338, + "learning_rate": 1.9929976149083272e-05, + "loss": 0.3992, + "num_tokens": 308006043.0, + "step": 403 + }, + { + "epoch": 0.5516947073069249, + "grad_norm": 0.25728837792031095, + "learning_rate": 1.9929355911759545e-05, + "loss": 0.3824, + "num_tokens": 308785558.0, + "step": 404 + }, + { + "epoch": 0.5530602882656055, + "grad_norm": 0.2572433565874476, + "learning_rate": 1.992873295047603e-05, + "loss": 0.3829, + "num_tokens": 309553286.0, + "step": 405 + }, + { + "epoch": 0.554425869224286, + "grad_norm": 0.2650468920377569, + "learning_rate": 1.9928107265422757e-05, + "loss": 0.376, + "num_tokens": 310326933.0, + "step": 406 + }, + { + "epoch": 0.5557914501829665, + "grad_norm": 0.2624003719990711, + "learning_rate": 1.9927478856790606e-05, + "loss": 0.4008, + "num_tokens": 311116677.0, + "step": 407 + }, + { + "epoch": 0.557157031141647, + "grad_norm": 0.3375733066147591, + "learning_rate": 1.992684772477127e-05, + "loss": 0.3759, + "num_tokens": 311856460.0, + "step": 408 + }, + { + "epoch": 0.5585226121003275, + "grad_norm": 0.28472978510710845, + "learning_rate": 1.992621386955728e-05, + "loss": 0.3907, + "num_tokens": 312526419.0, + "step": 409 + }, + { + "epoch": 0.559888193059008, + "grad_norm": 0.23641115855611305, + "learning_rate": 1.9925577291342005e-05, + "loss": 0.3824, + "num_tokens": 313315050.0, + "step": 410 + }, + { + "epoch": 0.5612537740176885, + "grad_norm": 0.26283645892730956, + "learning_rate": 1.9924937990319627e-05, + "loss": 0.3902, + "num_tokens": 314118654.0, + "step": 411 + }, + { + "epoch": 0.5626193549763691, + "grad_norm": 0.23665553287273927, + "learning_rate": 1.9924295966685175e-05, + "loss": 0.3784, + "num_tokens": 314876435.0, + "step": 412 + }, + { + "epoch": 0.5639849359350495, + "grad_norm": 0.26426203107354346, + "learning_rate": 1.9923651220634505e-05, + "loss": 0.3986, + "num_tokens": 315647483.0, + "step": 413 + }, + { + "epoch": 0.5653505168937301, + "grad_norm": 0.25774901326287525, + "learning_rate": 1.9923003752364297e-05, + "loss": 0.3939, + "num_tokens": 316454197.0, + "step": 414 + }, + { + "epoch": 0.5667160978524106, + "grad_norm": 0.2768510430148426, + "learning_rate": 1.9922353562072062e-05, + "loss": 0.381, + "num_tokens": 317113360.0, + "step": 415 + }, + { + "epoch": 0.5680816788110911, + "grad_norm": 0.2618735623570518, + "learning_rate": 1.9921700649956156e-05, + "loss": 0.3916, + "num_tokens": 317893602.0, + "step": 416 + }, + { + "epoch": 0.5694472597697716, + "grad_norm": 0.23339769271155472, + "learning_rate": 1.9921045016215745e-05, + "loss": 0.3957, + "num_tokens": 318737749.0, + "step": 417 + }, + { + "epoch": 0.5708128407284521, + "grad_norm": 0.2696881913596961, + "learning_rate": 1.992038666105084e-05, + "loss": 0.3673, + "num_tokens": 319398186.0, + "step": 418 + }, + { + "epoch": 0.5721784216871326, + "grad_norm": 0.2534714084698837, + "learning_rate": 1.991972558466227e-05, + "loss": 0.4052, + "num_tokens": 320235448.0, + "step": 419 + }, + { + "epoch": 0.5735440026458131, + "grad_norm": 0.3020992437635151, + "learning_rate": 1.9919061787251703e-05, + "loss": 0.3798, + "num_tokens": 320994453.0, + "step": 420 + }, + { + "epoch": 0.5749095836044936, + "grad_norm": 0.24054166227400872, + "learning_rate": 1.991839526902164e-05, + "loss": 0.3712, + "num_tokens": 321722682.0, + "step": 421 + }, + { + "epoch": 0.5762751645631741, + "grad_norm": 0.29600481633822495, + "learning_rate": 1.99177260301754e-05, + "loss": 0.3874, + "num_tokens": 322491853.0, + "step": 422 + }, + { + "epoch": 0.5776407455218546, + "grad_norm": 0.30018586208778225, + "learning_rate": 1.991705407091715e-05, + "loss": 0.3839, + "num_tokens": 323230647.0, + "step": 423 + }, + { + "epoch": 0.5790063264805352, + "grad_norm": 0.2803108550097222, + "learning_rate": 1.991637939145186e-05, + "loss": 0.4046, + "num_tokens": 323998089.0, + "step": 424 + }, + { + "epoch": 0.5803719074392156, + "grad_norm": 0.26754754121716146, + "learning_rate": 1.9915701991985356e-05, + "loss": 0.375, + "num_tokens": 324679142.0, + "step": 425 + }, + { + "epoch": 0.5817374883978962, + "grad_norm": 0.25236905102013724, + "learning_rate": 1.991502187272428e-05, + "loss": 0.3669, + "num_tokens": 325360054.0, + "step": 426 + }, + { + "epoch": 0.5831030693565766, + "grad_norm": 0.2681885553058281, + "learning_rate": 1.9914339033876107e-05, + "loss": 0.4014, + "num_tokens": 326095180.0, + "step": 427 + }, + { + "epoch": 0.5844686503152572, + "grad_norm": 0.25280579864144903, + "learning_rate": 1.991365347564914e-05, + "loss": 0.3909, + "num_tokens": 326905415.0, + "step": 428 + }, + { + "epoch": 0.5858342312739376, + "grad_norm": 0.2747701278453264, + "learning_rate": 1.9912965198252518e-05, + "loss": 0.3967, + "num_tokens": 327711008.0, + "step": 429 + }, + { + "epoch": 0.5871998122326182, + "grad_norm": 0.25525879442005694, + "learning_rate": 1.9912274201896203e-05, + "loss": 0.3816, + "num_tokens": 328403684.0, + "step": 430 + }, + { + "epoch": 0.5885653931912986, + "grad_norm": 0.2643107138017229, + "learning_rate": 1.9911580486790983e-05, + "loss": 0.3973, + "num_tokens": 329242640.0, + "step": 431 + }, + { + "epoch": 0.5899309741499792, + "grad_norm": 0.256201704365424, + "learning_rate": 1.9910884053148487e-05, + "loss": 0.3746, + "num_tokens": 329985268.0, + "step": 432 + }, + { + "epoch": 0.5912965551086597, + "grad_norm": 0.24903051883371624, + "learning_rate": 1.9910184901181162e-05, + "loss": 0.3802, + "num_tokens": 330815185.0, + "step": 433 + }, + { + "epoch": 0.5926621360673402, + "grad_norm": 0.24385500266499738, + "learning_rate": 1.990948303110229e-05, + "loss": 0.3582, + "num_tokens": 331567995.0, + "step": 434 + }, + { + "epoch": 0.5940277170260208, + "grad_norm": 0.24529074527901873, + "learning_rate": 1.9908778443125986e-05, + "loss": 0.3956, + "num_tokens": 332334214.0, + "step": 435 + }, + { + "epoch": 0.5953932979847012, + "grad_norm": 0.2832748422070591, + "learning_rate": 1.9908071137467183e-05, + "loss": 0.3901, + "num_tokens": 333131199.0, + "step": 436 + }, + { + "epoch": 0.5967588789433818, + "grad_norm": 0.24354259019370247, + "learning_rate": 1.9907361114341654e-05, + "loss": 0.3805, + "num_tokens": 333900299.0, + "step": 437 + }, + { + "epoch": 0.5981244599020622, + "grad_norm": 0.2572670626553479, + "learning_rate": 1.990664837396599e-05, + "loss": 0.3828, + "num_tokens": 334642801.0, + "step": 438 + }, + { + "epoch": 0.5994900408607428, + "grad_norm": 0.2777162274871795, + "learning_rate": 1.9905932916557624e-05, + "loss": 0.3736, + "num_tokens": 335423971.0, + "step": 439 + }, + { + "epoch": 0.6008556218194232, + "grad_norm": 0.23616167881302969, + "learning_rate": 1.990521474233481e-05, + "loss": 0.3585, + "num_tokens": 336156368.0, + "step": 440 + }, + { + "epoch": 0.6022212027781038, + "grad_norm": 0.24672370189004839, + "learning_rate": 1.9904493851516628e-05, + "loss": 0.37, + "num_tokens": 336842360.0, + "step": 441 + }, + { + "epoch": 0.6035867837367843, + "grad_norm": 0.2567303053916664, + "learning_rate": 1.9903770244322993e-05, + "loss": 0.3968, + "num_tokens": 337607796.0, + "step": 442 + }, + { + "epoch": 0.6049523646954648, + "grad_norm": 0.25557272386307955, + "learning_rate": 1.9903043920974644e-05, + "loss": 0.403, + "num_tokens": 338378413.0, + "step": 443 + }, + { + "epoch": 0.6063179456541453, + "grad_norm": 0.23869729520466798, + "learning_rate": 1.9902314881693155e-05, + "loss": 0.3872, + "num_tokens": 339163790.0, + "step": 444 + }, + { + "epoch": 0.6076835266128258, + "grad_norm": 0.258158730017596, + "learning_rate": 1.9901583126700922e-05, + "loss": 0.3833, + "num_tokens": 339913682.0, + "step": 445 + }, + { + "epoch": 0.6090491075715063, + "grad_norm": 0.2785739301599366, + "learning_rate": 1.9900848656221173e-05, + "loss": 0.4001, + "num_tokens": 340667280.0, + "step": 446 + }, + { + "epoch": 0.6104146885301868, + "grad_norm": 0.27475036069638153, + "learning_rate": 1.990011147047796e-05, + "loss": 0.3947, + "num_tokens": 341402387.0, + "step": 447 + }, + { + "epoch": 0.6117802694888673, + "grad_norm": 0.258511209003639, + "learning_rate": 1.9899371569696168e-05, + "loss": 0.3737, + "num_tokens": 342126377.0, + "step": 448 + }, + { + "epoch": 0.6131458504475478, + "grad_norm": 0.28808180786894944, + "learning_rate": 1.989862895410151e-05, + "loss": 0.3681, + "num_tokens": 342944624.0, + "step": 449 + }, + { + "epoch": 0.6145114314062283, + "grad_norm": 0.2709620428767766, + "learning_rate": 1.9897883623920523e-05, + "loss": 0.3958, + "num_tokens": 343793644.0, + "step": 450 + }, + { + "epoch": 0.6158770123649089, + "grad_norm": 0.24149618865623834, + "learning_rate": 1.989713557938058e-05, + "loss": 0.3844, + "num_tokens": 344553381.0, + "step": 451 + }, + { + "epoch": 0.6172425933235893, + "grad_norm": 0.2838899023279151, + "learning_rate": 1.989638482070987e-05, + "loss": 0.4093, + "num_tokens": 345386637.0, + "step": 452 + }, + { + "epoch": 0.6186081742822699, + "grad_norm": 0.2737458195788977, + "learning_rate": 1.9895631348137427e-05, + "loss": 0.3931, + "num_tokens": 346134370.0, + "step": 453 + }, + { + "epoch": 0.6199737552409503, + "grad_norm": 0.22723145417246904, + "learning_rate": 1.9894875161893092e-05, + "loss": 0.3764, + "num_tokens": 346892468.0, + "step": 454 + }, + { + "epoch": 0.6213393361996309, + "grad_norm": 0.2491686892612959, + "learning_rate": 1.9894116262207553e-05, + "loss": 0.3875, + "num_tokens": 347629735.0, + "step": 455 + }, + { + "epoch": 0.6227049171583113, + "grad_norm": 0.2471805251070627, + "learning_rate": 1.989335464931231e-05, + "loss": 0.4011, + "num_tokens": 348412290.0, + "step": 456 + }, + { + "epoch": 0.6240704981169919, + "grad_norm": 0.25489337619836044, + "learning_rate": 1.9892590323439703e-05, + "loss": 0.3828, + "num_tokens": 349089958.0, + "step": 457 + }, + { + "epoch": 0.6254360790756723, + "grad_norm": 0.22881235723740262, + "learning_rate": 1.9891823284822893e-05, + "loss": 0.3792, + "num_tokens": 349823834.0, + "step": 458 + }, + { + "epoch": 0.6268016600343529, + "grad_norm": 0.23315263734184938, + "learning_rate": 1.9891053533695875e-05, + "loss": 0.3966, + "num_tokens": 350632107.0, + "step": 459 + }, + { + "epoch": 0.6281672409930334, + "grad_norm": 0.23079513648155103, + "learning_rate": 1.989028107029346e-05, + "loss": 0.362, + "num_tokens": 351357486.0, + "step": 460 + }, + { + "epoch": 0.6295328219517139, + "grad_norm": 0.2827631709395634, + "learning_rate": 1.9889505894851295e-05, + "loss": 0.3816, + "num_tokens": 352152235.0, + "step": 461 + }, + { + "epoch": 0.6308984029103945, + "grad_norm": 0.23472627965073545, + "learning_rate": 1.9888728007605856e-05, + "loss": 0.3866, + "num_tokens": 352899563.0, + "step": 462 + }, + { + "epoch": 0.6322639838690749, + "grad_norm": 0.2763922513968407, + "learning_rate": 1.9887947408794436e-05, + "loss": 0.386, + "num_tokens": 353641255.0, + "step": 463 + }, + { + "epoch": 0.6336295648277555, + "grad_norm": 0.2325647245087109, + "learning_rate": 1.9887164098655167e-05, + "loss": 0.3747, + "num_tokens": 354413544.0, + "step": 464 + }, + { + "epoch": 0.6349951457864359, + "grad_norm": 0.2659446044254964, + "learning_rate": 1.9886378077427004e-05, + "loss": 0.3834, + "num_tokens": 355172169.0, + "step": 465 + }, + { + "epoch": 0.6363607267451165, + "grad_norm": 0.24388278827787876, + "learning_rate": 1.988558934534972e-05, + "loss": 0.3722, + "num_tokens": 355974796.0, + "step": 466 + }, + { + "epoch": 0.6377263077037969, + "grad_norm": 0.25243421298958635, + "learning_rate": 1.9884797902663935e-05, + "loss": 0.392, + "num_tokens": 356653202.0, + "step": 467 + }, + { + "epoch": 0.6390918886624775, + "grad_norm": 0.2626999112356006, + "learning_rate": 1.9884003749611076e-05, + "loss": 0.4143, + "num_tokens": 357453524.0, + "step": 468 + }, + { + "epoch": 0.640457469621158, + "grad_norm": 0.25183527749776646, + "learning_rate": 1.988320688643341e-05, + "loss": 0.3994, + "num_tokens": 358205430.0, + "step": 469 + }, + { + "epoch": 0.6418230505798385, + "grad_norm": 0.27717369348191173, + "learning_rate": 1.988240731337402e-05, + "loss": 0.3932, + "num_tokens": 358969175.0, + "step": 470 + }, + { + "epoch": 0.643188631538519, + "grad_norm": 0.22146088432606734, + "learning_rate": 1.9881605030676816e-05, + "loss": 0.3621, + "num_tokens": 359732045.0, + "step": 471 + }, + { + "epoch": 0.6445542124971995, + "grad_norm": 0.27802956769561044, + "learning_rate": 1.9880800038586553e-05, + "loss": 0.3833, + "num_tokens": 360503104.0, + "step": 472 + }, + { + "epoch": 0.64591979345588, + "grad_norm": 0.2733803995679551, + "learning_rate": 1.9879992337348792e-05, + "loss": 0.3796, + "num_tokens": 361190644.0, + "step": 473 + }, + { + "epoch": 0.6472853744145605, + "grad_norm": 0.2810144410561898, + "learning_rate": 1.987918192720993e-05, + "loss": 0.3873, + "num_tokens": 361964681.0, + "step": 474 + }, + { + "epoch": 0.648650955373241, + "grad_norm": 0.2265414791985924, + "learning_rate": 1.9878368808417185e-05, + "loss": 0.3795, + "num_tokens": 362712524.0, + "step": 475 + }, + { + "epoch": 0.6500165363319215, + "grad_norm": 0.3060506099122097, + "learning_rate": 1.9877552981218605e-05, + "loss": 0.3947, + "num_tokens": 363444809.0, + "step": 476 + }, + { + "epoch": 0.651382117290602, + "grad_norm": 0.23865118502594165, + "learning_rate": 1.9876734445863065e-05, + "loss": 0.3824, + "num_tokens": 364223553.0, + "step": 477 + }, + { + "epoch": 0.6527476982492826, + "grad_norm": 0.28593938491522236, + "learning_rate": 1.987591320260027e-05, + "loss": 0.3716, + "num_tokens": 364930413.0, + "step": 478 + }, + { + "epoch": 0.654113279207963, + "grad_norm": 0.2242331191091773, + "learning_rate": 1.9875089251680735e-05, + "loss": 0.3774, + "num_tokens": 365627313.0, + "step": 479 + }, + { + "epoch": 0.6554788601666436, + "grad_norm": 0.2732859231465138, + "learning_rate": 1.9874262593355815e-05, + "loss": 0.3632, + "num_tokens": 366390560.0, + "step": 480 + }, + { + "epoch": 0.656844441125324, + "grad_norm": 0.22229610323101276, + "learning_rate": 1.9873433227877693e-05, + "loss": 0.3824, + "num_tokens": 367203785.0, + "step": 481 + }, + { + "epoch": 0.6582100220840046, + "grad_norm": 0.283970200130262, + "learning_rate": 1.987260115549937e-05, + "loss": 0.3982, + "num_tokens": 367972346.0, + "step": 482 + }, + { + "epoch": 0.659575603042685, + "grad_norm": 0.26638015326160996, + "learning_rate": 1.9871766376474668e-05, + "loss": 0.3933, + "num_tokens": 368704810.0, + "step": 483 + }, + { + "epoch": 0.6609411840013656, + "grad_norm": 0.2826046729667879, + "learning_rate": 1.9870928891058253e-05, + "loss": 0.3922, + "num_tokens": 369463250.0, + "step": 484 + }, + { + "epoch": 0.662306764960046, + "grad_norm": 0.25397644275180864, + "learning_rate": 1.98700886995056e-05, + "loss": 0.3831, + "num_tokens": 370240745.0, + "step": 485 + }, + { + "epoch": 0.6636723459187266, + "grad_norm": 0.23163295447995425, + "learning_rate": 1.9869245802073014e-05, + "loss": 0.3642, + "num_tokens": 370918463.0, + "step": 486 + }, + { + "epoch": 0.6650379268774071, + "grad_norm": 0.23269281096405695, + "learning_rate": 1.986840019901763e-05, + "loss": 0.3706, + "num_tokens": 371669825.0, + "step": 487 + }, + { + "epoch": 0.6664035078360876, + "grad_norm": 0.21497843188130977, + "learning_rate": 1.9867551890597402e-05, + "loss": 0.3746, + "num_tokens": 372455157.0, + "step": 488 + }, + { + "epoch": 0.6677690887947682, + "grad_norm": 0.21332532471343174, + "learning_rate": 1.986670087707111e-05, + "loss": 0.3915, + "num_tokens": 373292474.0, + "step": 489 + }, + { + "epoch": 0.6691346697534486, + "grad_norm": 0.21758228015114192, + "learning_rate": 1.9865847158698373e-05, + "loss": 0.3833, + "num_tokens": 374081092.0, + "step": 490 + }, + { + "epoch": 0.6705002507121292, + "grad_norm": 0.22771642866648495, + "learning_rate": 1.9864990735739607e-05, + "loss": 0.3798, + "num_tokens": 374866303.0, + "step": 491 + }, + { + "epoch": 0.6718658316708096, + "grad_norm": 0.27961910298625503, + "learning_rate": 1.9864131608456082e-05, + "loss": 0.3697, + "num_tokens": 375564177.0, + "step": 492 + }, + { + "epoch": 0.6732314126294902, + "grad_norm": 0.23494724323530727, + "learning_rate": 1.9863269777109875e-05, + "loss": 0.3643, + "num_tokens": 376352567.0, + "step": 493 + }, + { + "epoch": 0.6745969935881706, + "grad_norm": 0.2690806115631594, + "learning_rate": 1.9862405241963894e-05, + "loss": 0.3628, + "num_tokens": 377105995.0, + "step": 494 + }, + { + "epoch": 0.6759625745468512, + "grad_norm": 0.26966208227775673, + "learning_rate": 1.986153800328187e-05, + "loss": 0.3953, + "num_tokens": 377839271.0, + "step": 495 + }, + { + "epoch": 0.6773281555055317, + "grad_norm": 0.27724799806112765, + "learning_rate": 1.9860668061328364e-05, + "loss": 0.3781, + "num_tokens": 378614568.0, + "step": 496 + }, + { + "epoch": 0.6786937364642122, + "grad_norm": 0.2435848556850448, + "learning_rate": 1.9859795416368758e-05, + "loss": 0.3504, + "num_tokens": 379355838.0, + "step": 497 + }, + { + "epoch": 0.6800593174228927, + "grad_norm": 0.27485323256623506, + "learning_rate": 1.985892006866925e-05, + "loss": 0.3855, + "num_tokens": 380106924.0, + "step": 498 + }, + { + "epoch": 0.6814248983815732, + "grad_norm": 0.25543341775917644, + "learning_rate": 1.9858042018496882e-05, + "loss": 0.4035, + "num_tokens": 380873032.0, + "step": 499 + }, + { + "epoch": 0.6827904793402537, + "grad_norm": 0.23291356363517804, + "learning_rate": 1.98571612661195e-05, + "loss": 0.3689, + "num_tokens": 381611002.0, + "step": 500 + }, + { + "epoch": 0.6841560602989342, + "grad_norm": 0.24105329409525156, + "learning_rate": 1.9856277811805788e-05, + "loss": 0.3972, + "num_tokens": 382420963.0, + "step": 501 + }, + { + "epoch": 0.6855216412576147, + "grad_norm": 0.22876948749965997, + "learning_rate": 1.9855391655825246e-05, + "loss": 0.3928, + "num_tokens": 383164139.0, + "step": 502 + }, + { + "epoch": 0.6868872222162952, + "grad_norm": 0.2579673005328045, + "learning_rate": 1.9854502798448208e-05, + "loss": 0.3745, + "num_tokens": 383873544.0, + "step": 503 + }, + { + "epoch": 0.6882528031749757, + "grad_norm": 0.2461310513626293, + "learning_rate": 1.985361123994582e-05, + "loss": 0.3738, + "num_tokens": 384672596.0, + "step": 504 + }, + { + "epoch": 0.6896183841336563, + "grad_norm": 0.28338509927491723, + "learning_rate": 1.9852716980590057e-05, + "loss": 0.3828, + "num_tokens": 385339695.0, + "step": 505 + }, + { + "epoch": 0.6909839650923367, + "grad_norm": 0.2338473085333619, + "learning_rate": 1.985182002065373e-05, + "loss": 0.3868, + "num_tokens": 386117976.0, + "step": 506 + }, + { + "epoch": 0.6923495460510173, + "grad_norm": 0.3058844564979638, + "learning_rate": 1.9850920360410447e-05, + "loss": 0.3895, + "num_tokens": 386905846.0, + "step": 507 + }, + { + "epoch": 0.6937151270096977, + "grad_norm": 0.24881858920143227, + "learning_rate": 1.9850018000134665e-05, + "loss": 0.3702, + "num_tokens": 387557365.0, + "step": 508 + }, + { + "epoch": 0.6950807079683783, + "grad_norm": 0.26122840195750624, + "learning_rate": 1.984911294010165e-05, + "loss": 0.383, + "num_tokens": 388355725.0, + "step": 509 + }, + { + "epoch": 0.6964462889270587, + "grad_norm": 0.2528416838151127, + "learning_rate": 1.9848205180587507e-05, + "loss": 0.3731, + "num_tokens": 389089982.0, + "step": 510 + }, + { + "epoch": 0.6978118698857393, + "grad_norm": 0.25373699637052777, + "learning_rate": 1.984729472186914e-05, + "loss": 0.3695, + "num_tokens": 389842970.0, + "step": 511 + }, + { + "epoch": 0.6991774508444197, + "grad_norm": 0.25595011310613536, + "learning_rate": 1.98463815642243e-05, + "loss": 0.4078, + "num_tokens": 390674832.0, + "step": 512 + }, + { + "epoch": 0.7005430318031003, + "grad_norm": 0.2283173779633846, + "learning_rate": 1.984546570793155e-05, + "loss": 0.3953, + "num_tokens": 391477961.0, + "step": 513 + }, + { + "epoch": 0.7019086127617808, + "grad_norm": 0.24127946688575574, + "learning_rate": 1.984454715327027e-05, + "loss": 0.3735, + "num_tokens": 392221351.0, + "step": 514 + }, + { + "epoch": 0.7032741937204613, + "grad_norm": 0.22195864373925833, + "learning_rate": 1.9843625900520685e-05, + "loss": 0.3784, + "num_tokens": 392928524.0, + "step": 515 + }, + { + "epoch": 0.7046397746791419, + "grad_norm": 0.2628662633547868, + "learning_rate": 1.9842701949963824e-05, + "loss": 0.3979, + "num_tokens": 393638153.0, + "step": 516 + }, + { + "epoch": 0.7060053556378223, + "grad_norm": 0.2193066546143698, + "learning_rate": 1.9841775301881538e-05, + "loss": 0.3805, + "num_tokens": 394382313.0, + "step": 517 + }, + { + "epoch": 0.7073709365965029, + "grad_norm": 0.23903929222407663, + "learning_rate": 1.9840845956556514e-05, + "loss": 0.395, + "num_tokens": 395182565.0, + "step": 518 + }, + { + "epoch": 0.7087365175551833, + "grad_norm": 0.22097089974659156, + "learning_rate": 1.9839913914272254e-05, + "loss": 0.3783, + "num_tokens": 395904917.0, + "step": 519 + }, + { + "epoch": 0.7101020985138639, + "grad_norm": 0.2470922722796958, + "learning_rate": 1.9838979175313083e-05, + "loss": 0.4086, + "num_tokens": 396652902.0, + "step": 520 + }, + { + "epoch": 0.7114676794725443, + "grad_norm": 0.22726727816183342, + "learning_rate": 1.983804173996415e-05, + "loss": 0.3769, + "num_tokens": 397450311.0, + "step": 521 + }, + { + "epoch": 0.7128332604312249, + "grad_norm": 0.23057022631657506, + "learning_rate": 1.9837101608511423e-05, + "loss": 0.3654, + "num_tokens": 398205440.0, + "step": 522 + }, + { + "epoch": 0.7141988413899054, + "grad_norm": 0.2530875570489181, + "learning_rate": 1.9836158781241696e-05, + "loss": 0.3836, + "num_tokens": 398927088.0, + "step": 523 + }, + { + "epoch": 0.7155644223485859, + "grad_norm": 0.27484578253896275, + "learning_rate": 1.9835213258442588e-05, + "loss": 0.3907, + "num_tokens": 399686370.0, + "step": 524 + }, + { + "epoch": 0.7169300033072664, + "grad_norm": 0.25455842237924364, + "learning_rate": 1.9834265040402538e-05, + "loss": 0.3943, + "num_tokens": 400413458.0, + "step": 525 + }, + { + "epoch": 0.7182955842659469, + "grad_norm": 0.2817596902691573, + "learning_rate": 1.98333141274108e-05, + "loss": 0.3838, + "num_tokens": 401149225.0, + "step": 526 + }, + { + "epoch": 0.7196611652246274, + "grad_norm": 0.24331185460597055, + "learning_rate": 1.9832360519757462e-05, + "loss": 0.3667, + "num_tokens": 401892171.0, + "step": 527 + }, + { + "epoch": 0.7210267461833079, + "grad_norm": 0.23574338178878287, + "learning_rate": 1.9831404217733426e-05, + "loss": 0.3882, + "num_tokens": 402646606.0, + "step": 528 + }, + { + "epoch": 0.7223923271419884, + "grad_norm": 0.24004211424765443, + "learning_rate": 1.9830445221630418e-05, + "loss": 0.3699, + "num_tokens": 403408801.0, + "step": 529 + }, + { + "epoch": 0.7237579081006689, + "grad_norm": 0.24192801205690154, + "learning_rate": 1.982948353174099e-05, + "loss": 0.3858, + "num_tokens": 404167900.0, + "step": 530 + }, + { + "epoch": 0.7251234890593494, + "grad_norm": 0.2341700165648956, + "learning_rate": 1.9828519148358505e-05, + "loss": 0.3721, + "num_tokens": 404939438.0, + "step": 531 + }, + { + "epoch": 0.72648907001803, + "grad_norm": 0.23697310002511854, + "learning_rate": 1.982755207177716e-05, + "loss": 0.3798, + "num_tokens": 405642199.0, + "step": 532 + }, + { + "epoch": 0.7278546509767104, + "grad_norm": 0.25967280968484235, + "learning_rate": 1.982658230229197e-05, + "loss": 0.4015, + "num_tokens": 406455367.0, + "step": 533 + }, + { + "epoch": 0.729220231935391, + "grad_norm": 0.23755660874203546, + "learning_rate": 1.9825609840198764e-05, + "loss": 0.3809, + "num_tokens": 407269972.0, + "step": 534 + }, + { + "epoch": 0.7305858128940714, + "grad_norm": 0.23348182751311877, + "learning_rate": 1.9824634685794198e-05, + "loss": 0.3811, + "num_tokens": 408061462.0, + "step": 535 + }, + { + "epoch": 0.731951393852752, + "grad_norm": 0.24165279724074662, + "learning_rate": 1.9823656839375753e-05, + "loss": 0.3955, + "num_tokens": 408786224.0, + "step": 536 + }, + { + "epoch": 0.7333169748114324, + "grad_norm": 0.2468350845226071, + "learning_rate": 1.9822676301241728e-05, + "loss": 0.378, + "num_tokens": 409563687.0, + "step": 537 + }, + { + "epoch": 0.734682555770113, + "grad_norm": 0.2478622719683164, + "learning_rate": 1.9821693071691237e-05, + "loss": 0.3936, + "num_tokens": 410341422.0, + "step": 538 + }, + { + "epoch": 0.7360481367287934, + "grad_norm": 0.25420915420800777, + "learning_rate": 1.982070715102423e-05, + "loss": 0.3862, + "num_tokens": 411058975.0, + "step": 539 + }, + { + "epoch": 0.737413717687474, + "grad_norm": 0.22002184202626918, + "learning_rate": 1.9819718539541463e-05, + "loss": 0.3839, + "num_tokens": 411855286.0, + "step": 540 + }, + { + "epoch": 0.7387792986461545, + "grad_norm": 0.3001776094826476, + "learning_rate": 1.9818727237544516e-05, + "loss": 0.3986, + "num_tokens": 412665147.0, + "step": 541 + }, + { + "epoch": 0.740144879604835, + "grad_norm": 0.25786898262664704, + "learning_rate": 1.9817733245335797e-05, + "loss": 0.3947, + "num_tokens": 413455885.0, + "step": 542 + }, + { + "epoch": 0.7415104605635155, + "grad_norm": 0.2603732577565379, + "learning_rate": 1.9816736563218527e-05, + "loss": 0.3846, + "num_tokens": 414187350.0, + "step": 543 + }, + { + "epoch": 0.742876041522196, + "grad_norm": 0.23779622601478156, + "learning_rate": 1.9815737191496757e-05, + "loss": 0.3763, + "num_tokens": 414928387.0, + "step": 544 + }, + { + "epoch": 0.7442416224808766, + "grad_norm": 0.22553059306561224, + "learning_rate": 1.9814735130475343e-05, + "loss": 0.3907, + "num_tokens": 415837678.0, + "step": 545 + }, + { + "epoch": 0.745607203439557, + "grad_norm": 0.24740338366171932, + "learning_rate": 1.981373038045997e-05, + "loss": 0.3903, + "num_tokens": 416571596.0, + "step": 546 + }, + { + "epoch": 0.7469727843982376, + "grad_norm": 0.217503603083374, + "learning_rate": 1.9812722941757158e-05, + "loss": 0.3765, + "num_tokens": 417331981.0, + "step": 547 + }, + { + "epoch": 0.748338365356918, + "grad_norm": 0.24288815763146587, + "learning_rate": 1.9811712814674217e-05, + "loss": 0.3647, + "num_tokens": 418077619.0, + "step": 548 + }, + { + "epoch": 0.7497039463155986, + "grad_norm": 0.26163525869359044, + "learning_rate": 1.98106999995193e-05, + "loss": 0.3692, + "num_tokens": 418753245.0, + "step": 549 + }, + { + "epoch": 0.7510695272742791, + "grad_norm": 0.23163428275016085, + "learning_rate": 1.980968449660137e-05, + "loss": 0.386, + "num_tokens": 419512346.0, + "step": 550 + }, + { + "epoch": 0.7524351082329596, + "grad_norm": 0.24983584864385303, + "learning_rate": 1.9808666306230216e-05, + "loss": 0.3726, + "num_tokens": 420240679.0, + "step": 551 + }, + { + "epoch": 0.7538006891916401, + "grad_norm": 0.25231398309944786, + "learning_rate": 1.9807645428716446e-05, + "loss": 0.3762, + "num_tokens": 420969087.0, + "step": 552 + }, + { + "epoch": 0.7551662701503206, + "grad_norm": 0.2296403351380135, + "learning_rate": 1.980662186437148e-05, + "loss": 0.3851, + "num_tokens": 421664482.0, + "step": 553 + }, + { + "epoch": 0.7565318511090011, + "grad_norm": 0.24567345732314982, + "learning_rate": 1.9805595613507568e-05, + "loss": 0.3705, + "num_tokens": 422390017.0, + "step": 554 + }, + { + "epoch": 0.7578974320676816, + "grad_norm": 0.23182413087172507, + "learning_rate": 1.980456667643777e-05, + "loss": 0.3799, + "num_tokens": 423141101.0, + "step": 555 + }, + { + "epoch": 0.7592630130263621, + "grad_norm": 0.2351678365619299, + "learning_rate": 1.9803535053475972e-05, + "loss": 0.3801, + "num_tokens": 423911241.0, + "step": 556 + }, + { + "epoch": 0.7606285939850426, + "grad_norm": 0.25878287215060225, + "learning_rate": 1.980250074493688e-05, + "loss": 0.3635, + "num_tokens": 424629204.0, + "step": 557 + }, + { + "epoch": 0.7619941749437231, + "grad_norm": 0.23582082867021878, + "learning_rate": 1.9801463751136013e-05, + "loss": 0.3842, + "num_tokens": 425332262.0, + "step": 558 + }, + { + "epoch": 0.7633597559024037, + "grad_norm": 0.24861234568698637, + "learning_rate": 1.9800424072389716e-05, + "loss": 0.3716, + "num_tokens": 426135873.0, + "step": 559 + }, + { + "epoch": 0.7647253368610841, + "grad_norm": 0.24766074717936187, + "learning_rate": 1.9799381709015143e-05, + "loss": 0.3725, + "num_tokens": 426854958.0, + "step": 560 + }, + { + "epoch": 0.7660909178197647, + "grad_norm": 0.23657253194854444, + "learning_rate": 1.9798336661330286e-05, + "loss": 0.3744, + "num_tokens": 427580781.0, + "step": 561 + }, + { + "epoch": 0.7674564987784451, + "grad_norm": 0.2423998623700624, + "learning_rate": 1.9797288929653935e-05, + "loss": 0.3772, + "num_tokens": 428332677.0, + "step": 562 + }, + { + "epoch": 0.7688220797371257, + "grad_norm": 0.23620336491932659, + "learning_rate": 1.979623851430571e-05, + "loss": 0.3767, + "num_tokens": 429196780.0, + "step": 563 + }, + { + "epoch": 0.7701876606958061, + "grad_norm": 0.24285190155311534, + "learning_rate": 1.9795185415606045e-05, + "loss": 0.3739, + "num_tokens": 429959680.0, + "step": 564 + }, + { + "epoch": 0.7715532416544867, + "grad_norm": 0.2188096444122681, + "learning_rate": 1.9794129633876198e-05, + "loss": 0.3791, + "num_tokens": 430737865.0, + "step": 565 + }, + { + "epoch": 0.7729188226131671, + "grad_norm": 0.2291050916443397, + "learning_rate": 1.9793071169438244e-05, + "loss": 0.378, + "num_tokens": 431481363.0, + "step": 566 + }, + { + "epoch": 0.7742844035718477, + "grad_norm": 0.2223169199335638, + "learning_rate": 1.979201002261507e-05, + "loss": 0.3816, + "num_tokens": 432280602.0, + "step": 567 + }, + { + "epoch": 0.7756499845305282, + "grad_norm": 0.2277022540324355, + "learning_rate": 1.9790946193730384e-05, + "loss": 0.3835, + "num_tokens": 433046763.0, + "step": 568 + }, + { + "epoch": 0.7770155654892087, + "grad_norm": 0.2212733199414894, + "learning_rate": 1.9789879683108722e-05, + "loss": 0.3799, + "num_tokens": 433830679.0, + "step": 569 + }, + { + "epoch": 0.7783811464478892, + "grad_norm": 0.22542859032477075, + "learning_rate": 1.9788810491075425e-05, + "loss": 0.359, + "num_tokens": 434524050.0, + "step": 570 + }, + { + "epoch": 0.7797467274065697, + "grad_norm": 0.2342874963785997, + "learning_rate": 1.978773861795666e-05, + "loss": 0.3815, + "num_tokens": 435330086.0, + "step": 571 + }, + { + "epoch": 0.7811123083652503, + "grad_norm": 0.22814192216046725, + "learning_rate": 1.97866640640794e-05, + "loss": 0.3798, + "num_tokens": 436139120.0, + "step": 572 + }, + { + "epoch": 0.7824778893239307, + "grad_norm": 0.2331823325460082, + "learning_rate": 1.978558682977146e-05, + "loss": 0.3682, + "num_tokens": 436860759.0, + "step": 573 + }, + { + "epoch": 0.7838434702826113, + "grad_norm": 0.2557923886981176, + "learning_rate": 1.9784506915361445e-05, + "loss": 0.3755, + "num_tokens": 437661182.0, + "step": 574 + }, + { + "epoch": 0.7852090512412917, + "grad_norm": 0.2348809627974478, + "learning_rate": 1.9783424321178797e-05, + "loss": 0.3795, + "num_tokens": 438449444.0, + "step": 575 + }, + { + "epoch": 0.7865746321999723, + "grad_norm": 0.2784741532632914, + "learning_rate": 1.9782339047553767e-05, + "loss": 0.3903, + "num_tokens": 439327369.0, + "step": 576 + }, + { + "epoch": 0.7879402131586528, + "grad_norm": 0.23968630750825912, + "learning_rate": 1.9781251094817423e-05, + "loss": 0.3851, + "num_tokens": 440150905.0, + "step": 577 + }, + { + "epoch": 0.7893057941173333, + "grad_norm": 0.23687336796140596, + "learning_rate": 1.9780160463301653e-05, + "loss": 0.3814, + "num_tokens": 440966144.0, + "step": 578 + }, + { + "epoch": 0.7906713750760138, + "grad_norm": 0.2134502546757845, + "learning_rate": 1.9779067153339165e-05, + "loss": 0.391, + "num_tokens": 441775252.0, + "step": 579 + }, + { + "epoch": 0.7920369560346943, + "grad_norm": 0.23513431515710911, + "learning_rate": 1.9777971165263477e-05, + "loss": 0.3865, + "num_tokens": 442486015.0, + "step": 580 + }, + { + "epoch": 0.7934025369933748, + "grad_norm": 0.24181196565377724, + "learning_rate": 1.9776872499408925e-05, + "loss": 0.3738, + "num_tokens": 443232818.0, + "step": 581 + }, + { + "epoch": 0.7947681179520553, + "grad_norm": 0.22997897649246346, + "learning_rate": 1.977577115611067e-05, + "loss": 0.3797, + "num_tokens": 443987483.0, + "step": 582 + }, + { + "epoch": 0.7961336989107358, + "grad_norm": 0.2593450511775391, + "learning_rate": 1.977466713570468e-05, + "loss": 0.3859, + "num_tokens": 444714986.0, + "step": 583 + }, + { + "epoch": 0.7974992798694163, + "grad_norm": 0.22856268800651694, + "learning_rate": 1.9773560438527748e-05, + "loss": 0.3831, + "num_tokens": 445546278.0, + "step": 584 + }, + { + "epoch": 0.7988648608280968, + "grad_norm": 0.24982362205340583, + "learning_rate": 1.9772451064917473e-05, + "loss": 0.3666, + "num_tokens": 446283839.0, + "step": 585 + }, + { + "epoch": 0.8002304417867774, + "grad_norm": 0.3205232125146528, + "learning_rate": 1.9771339015212283e-05, + "loss": 0.3756, + "num_tokens": 446984657.0, + "step": 586 + }, + { + "epoch": 0.8015960227454578, + "grad_norm": 0.22394185488731852, + "learning_rate": 1.9770224289751413e-05, + "loss": 0.3862, + "num_tokens": 447723211.0, + "step": 587 + }, + { + "epoch": 0.8029616037041384, + "grad_norm": 0.24549860293359288, + "learning_rate": 1.976910688887492e-05, + "loss": 0.3808, + "num_tokens": 448502325.0, + "step": 588 + }, + { + "epoch": 0.8043271846628188, + "grad_norm": 0.2465950684973369, + "learning_rate": 1.9767986812923672e-05, + "loss": 0.4037, + "num_tokens": 449250620.0, + "step": 589 + }, + { + "epoch": 0.8056927656214994, + "grad_norm": 0.24912159289975766, + "learning_rate": 1.9766864062239354e-05, + "loss": 0.3951, + "num_tokens": 450121084.0, + "step": 590 + }, + { + "epoch": 0.8070583465801798, + "grad_norm": 0.26089058276887034, + "learning_rate": 1.976573863716448e-05, + "loss": 0.3776, + "num_tokens": 450902615.0, + "step": 591 + }, + { + "epoch": 0.8084239275388604, + "grad_norm": 0.2883150556050347, + "learning_rate": 1.976461053804235e-05, + "loss": 0.3832, + "num_tokens": 451655810.0, + "step": 592 + }, + { + "epoch": 0.8097895084975408, + "grad_norm": 0.2504770907786986, + "learning_rate": 1.976347976521711e-05, + "loss": 0.3701, + "num_tokens": 452387810.0, + "step": 593 + }, + { + "epoch": 0.8111550894562214, + "grad_norm": 0.23527347768674112, + "learning_rate": 1.976234631903371e-05, + "loss": 0.3681, + "num_tokens": 453195850.0, + "step": 594 + }, + { + "epoch": 0.812520670414902, + "grad_norm": 0.22360428801954593, + "learning_rate": 1.976121019983791e-05, + "loss": 0.3856, + "num_tokens": 453924672.0, + "step": 595 + }, + { + "epoch": 0.8138862513735824, + "grad_norm": 0.23516510280380343, + "learning_rate": 1.9760071407976297e-05, + "loss": 0.3728, + "num_tokens": 454674914.0, + "step": 596 + }, + { + "epoch": 0.815251832332263, + "grad_norm": 0.22256579676769378, + "learning_rate": 1.9758929943796264e-05, + "loss": 0.3623, + "num_tokens": 455425901.0, + "step": 597 + }, + { + "epoch": 0.8166174132909434, + "grad_norm": 0.21090538292293234, + "learning_rate": 1.9757785807646018e-05, + "loss": 0.3738, + "num_tokens": 456187892.0, + "step": 598 + }, + { + "epoch": 0.817982994249624, + "grad_norm": 0.2118197093454376, + "learning_rate": 1.9756638999874595e-05, + "loss": 0.3859, + "num_tokens": 456952090.0, + "step": 599 + }, + { + "epoch": 0.8193485752083044, + "grad_norm": 0.23993957159349516, + "learning_rate": 1.975548952083183e-05, + "loss": 0.3903, + "num_tokens": 457759460.0, + "step": 600 + }, + { + "epoch": 0.820714156166985, + "grad_norm": 0.22474858226223499, + "learning_rate": 1.9754337370868375e-05, + "loss": 0.3886, + "num_tokens": 458501714.0, + "step": 601 + }, + { + "epoch": 0.8220797371256654, + "grad_norm": 0.23954931191966175, + "learning_rate": 1.975318255033571e-05, + "loss": 0.3879, + "num_tokens": 459274331.0, + "step": 602 + }, + { + "epoch": 0.823445318084346, + "grad_norm": 0.22972783258852786, + "learning_rate": 1.9752025059586117e-05, + "loss": 0.3868, + "num_tokens": 459976740.0, + "step": 603 + }, + { + "epoch": 0.8248108990430265, + "grad_norm": 0.24829074202883777, + "learning_rate": 1.9750864898972695e-05, + "loss": 0.3732, + "num_tokens": 460698062.0, + "step": 604 + }, + { + "epoch": 0.826176480001707, + "grad_norm": 0.21473849989869662, + "learning_rate": 1.9749702068849363e-05, + "loss": 0.3686, + "num_tokens": 461482739.0, + "step": 605 + }, + { + "epoch": 0.8275420609603875, + "grad_norm": 0.22919416220071162, + "learning_rate": 1.9748536569570843e-05, + "loss": 0.3845, + "num_tokens": 462328274.0, + "step": 606 + }, + { + "epoch": 0.828907641919068, + "grad_norm": 0.23072253824361572, + "learning_rate": 1.9747368401492684e-05, + "loss": 0.3635, + "num_tokens": 463065850.0, + "step": 607 + }, + { + "epoch": 0.8302732228777485, + "grad_norm": 0.2356488836200443, + "learning_rate": 1.9746197564971243e-05, + "loss": 0.379, + "num_tokens": 463809036.0, + "step": 608 + }, + { + "epoch": 0.831638803836429, + "grad_norm": 0.23259246905184328, + "learning_rate": 1.974502406036369e-05, + "loss": 0.3766, + "num_tokens": 464564339.0, + "step": 609 + }, + { + "epoch": 0.8330043847951095, + "grad_norm": 0.24030443665738932, + "learning_rate": 1.9743847888028015e-05, + "loss": 0.3835, + "num_tokens": 465398163.0, + "step": 610 + }, + { + "epoch": 0.83436996575379, + "grad_norm": 0.2277197308433794, + "learning_rate": 1.9742669048323008e-05, + "loss": 0.3853, + "num_tokens": 466189270.0, + "step": 611 + }, + { + "epoch": 0.8357355467124705, + "grad_norm": 0.21942654499212572, + "learning_rate": 1.9741487541608288e-05, + "loss": 0.3826, + "num_tokens": 467007726.0, + "step": 612 + }, + { + "epoch": 0.8371011276711511, + "grad_norm": 0.2330821982686077, + "learning_rate": 1.9740303368244284e-05, + "loss": 0.3514, + "num_tokens": 467673955.0, + "step": 613 + }, + { + "epoch": 0.8384667086298315, + "grad_norm": 0.23616162427208337, + "learning_rate": 1.9739116528592228e-05, + "loss": 0.3742, + "num_tokens": 468434142.0, + "step": 614 + }, + { + "epoch": 0.8398322895885121, + "grad_norm": 0.22004329045624535, + "learning_rate": 1.973792702301418e-05, + "loss": 0.3819, + "num_tokens": 469207037.0, + "step": 615 + }, + { + "epoch": 0.8411978705471925, + "grad_norm": 0.2310741474815828, + "learning_rate": 1.9736734851873005e-05, + "loss": 0.3692, + "num_tokens": 469900771.0, + "step": 616 + }, + { + "epoch": 0.8425634515058731, + "grad_norm": 0.22679386651007433, + "learning_rate": 1.9735540015532382e-05, + "loss": 0.3742, + "num_tokens": 470685956.0, + "step": 617 + }, + { + "epoch": 0.8439290324645535, + "grad_norm": 0.21893249318705613, + "learning_rate": 1.9734342514356803e-05, + "loss": 0.3837, + "num_tokens": 471399655.0, + "step": 618 + }, + { + "epoch": 0.8452946134232341, + "grad_norm": 0.24953887721403875, + "learning_rate": 1.973314234871158e-05, + "loss": 0.3782, + "num_tokens": 472232989.0, + "step": 619 + }, + { + "epoch": 0.8466601943819145, + "grad_norm": 0.22796048696586324, + "learning_rate": 1.9731939518962823e-05, + "loss": 0.3645, + "num_tokens": 472988964.0, + "step": 620 + }, + { + "epoch": 0.8480257753405951, + "grad_norm": 0.22821522242862166, + "learning_rate": 1.9730734025477467e-05, + "loss": 0.3868, + "num_tokens": 473815786.0, + "step": 621 + }, + { + "epoch": 0.8493913562992756, + "grad_norm": 0.24489662423830855, + "learning_rate": 1.9729525868623254e-05, + "loss": 0.3817, + "num_tokens": 474550755.0, + "step": 622 + }, + { + "epoch": 0.8507569372579561, + "grad_norm": 0.23760887157420488, + "learning_rate": 1.972831504876875e-05, + "loss": 0.3932, + "num_tokens": 475315571.0, + "step": 623 + }, + { + "epoch": 0.8521225182166366, + "grad_norm": 0.22876048170493865, + "learning_rate": 1.9727101566283307e-05, + "loss": 0.3736, + "num_tokens": 476008579.0, + "step": 624 + }, + { + "epoch": 0.8534880991753171, + "grad_norm": 0.23494991281974495, + "learning_rate": 1.9725885421537116e-05, + "loss": 0.3911, + "num_tokens": 476811527.0, + "step": 625 + }, + { + "epoch": 0.8548536801339977, + "grad_norm": 0.2316856448414772, + "learning_rate": 1.972466661490117e-05, + "loss": 0.3832, + "num_tokens": 477563397.0, + "step": 626 + }, + { + "epoch": 0.8562192610926781, + "grad_norm": 0.21441754690128814, + "learning_rate": 1.9723445146747277e-05, + "loss": 0.3772, + "num_tokens": 478362280.0, + "step": 627 + }, + { + "epoch": 0.8575848420513587, + "grad_norm": 0.23297456924908996, + "learning_rate": 1.9722221017448045e-05, + "loss": 0.3908, + "num_tokens": 479230529.0, + "step": 628 + }, + { + "epoch": 0.8589504230100391, + "grad_norm": 0.22225694243110378, + "learning_rate": 1.9720994227376908e-05, + "loss": 0.3836, + "num_tokens": 480010501.0, + "step": 629 + }, + { + "epoch": 0.8603160039687197, + "grad_norm": 0.2125115149147555, + "learning_rate": 1.971976477690811e-05, + "loss": 0.3812, + "num_tokens": 480747299.0, + "step": 630 + }, + { + "epoch": 0.8616815849274002, + "grad_norm": 0.20654861882721307, + "learning_rate": 1.9718532666416695e-05, + "loss": 0.3688, + "num_tokens": 481482682.0, + "step": 631 + }, + { + "epoch": 0.8630471658860807, + "grad_norm": 0.2416448641446861, + "learning_rate": 1.9717297896278535e-05, + "loss": 0.3778, + "num_tokens": 482249063.0, + "step": 632 + }, + { + "epoch": 0.8644127468447612, + "grad_norm": 0.22426248538017754, + "learning_rate": 1.97160604668703e-05, + "loss": 0.3814, + "num_tokens": 482979943.0, + "step": 633 + }, + { + "epoch": 0.8657783278034417, + "grad_norm": 0.2552756736492648, + "learning_rate": 1.9714820378569474e-05, + "loss": 0.3955, + "num_tokens": 483814857.0, + "step": 634 + }, + { + "epoch": 0.8671439087621222, + "grad_norm": 0.22092178064559376, + "learning_rate": 1.971357763175436e-05, + "loss": 0.3956, + "num_tokens": 484562127.0, + "step": 635 + }, + { + "epoch": 0.8685094897208027, + "grad_norm": 0.21609906284675354, + "learning_rate": 1.9712332226804065e-05, + "loss": 0.3758, + "num_tokens": 485331121.0, + "step": 636 + }, + { + "epoch": 0.8698750706794832, + "grad_norm": 0.21126693038566016, + "learning_rate": 1.9711084164098506e-05, + "loss": 0.3702, + "num_tokens": 486141888.0, + "step": 637 + }, + { + "epoch": 0.8712406516381637, + "grad_norm": 0.21407447765340132, + "learning_rate": 1.970983344401841e-05, + "loss": 0.3778, + "num_tokens": 486843415.0, + "step": 638 + }, + { + "epoch": 0.8726062325968442, + "grad_norm": 0.22318803674207643, + "learning_rate": 1.9708580066945325e-05, + "loss": 0.37, + "num_tokens": 487664048.0, + "step": 639 + }, + { + "epoch": 0.8739718135555248, + "grad_norm": 0.26782782092336077, + "learning_rate": 1.9707324033261594e-05, + "loss": 0.366, + "num_tokens": 488443278.0, + "step": 640 + }, + { + "epoch": 0.8753373945142052, + "grad_norm": 0.2278862813802831, + "learning_rate": 1.970606534335039e-05, + "loss": 0.374, + "num_tokens": 489205290.0, + "step": 641 + }, + { + "epoch": 0.8767029754728858, + "grad_norm": 0.23677205947476207, + "learning_rate": 1.970480399759567e-05, + "loss": 0.37, + "num_tokens": 489948809.0, + "step": 642 + }, + { + "epoch": 0.8780685564315662, + "grad_norm": 0.2308773989115437, + "learning_rate": 1.9703539996382225e-05, + "loss": 0.3733, + "num_tokens": 490776253.0, + "step": 643 + }, + { + "epoch": 0.8794341373902468, + "grad_norm": 0.22638371427570686, + "learning_rate": 1.9702273340095647e-05, + "loss": 0.3738, + "num_tokens": 491574723.0, + "step": 644 + }, + { + "epoch": 0.8807997183489272, + "grad_norm": 0.2302236984102126, + "learning_rate": 1.970100402912234e-05, + "loss": 0.3799, + "num_tokens": 492268808.0, + "step": 645 + }, + { + "epoch": 0.8821652993076078, + "grad_norm": 0.2143052225501384, + "learning_rate": 1.9699732063849504e-05, + "loss": 0.3789, + "num_tokens": 493102054.0, + "step": 646 + }, + { + "epoch": 0.8835308802662882, + "grad_norm": 0.22439245687976675, + "learning_rate": 1.9698457444665172e-05, + "loss": 0.3978, + "num_tokens": 493896107.0, + "step": 647 + }, + { + "epoch": 0.8848964612249688, + "grad_norm": 0.21033479780085776, + "learning_rate": 1.969718017195817e-05, + "loss": 0.3878, + "num_tokens": 494753487.0, + "step": 648 + }, + { + "epoch": 0.8862620421836493, + "grad_norm": 0.22935056891003955, + "learning_rate": 1.9695900246118144e-05, + "loss": 0.3569, + "num_tokens": 495525209.0, + "step": 649 + }, + { + "epoch": 0.8876276231423298, + "grad_norm": 0.237107426362609, + "learning_rate": 1.9694617667535542e-05, + "loss": 0.3804, + "num_tokens": 496280567.0, + "step": 650 + }, + { + "epoch": 0.8889932041010103, + "grad_norm": 0.22000645836477845, + "learning_rate": 1.9693332436601616e-05, + "loss": 0.3819, + "num_tokens": 497059019.0, + "step": 651 + }, + { + "epoch": 0.8903587850596908, + "grad_norm": 0.20370084472449873, + "learning_rate": 1.9692044553708444e-05, + "loss": 0.385, + "num_tokens": 497862274.0, + "step": 652 + }, + { + "epoch": 0.8917243660183714, + "grad_norm": 0.2508176624909547, + "learning_rate": 1.9690754019248897e-05, + "loss": 0.3723, + "num_tokens": 498687395.0, + "step": 653 + }, + { + "epoch": 0.8930899469770518, + "grad_norm": 0.20011022460278874, + "learning_rate": 1.968946083361666e-05, + "loss": 0.3784, + "num_tokens": 499457325.0, + "step": 654 + }, + { + "epoch": 0.8944555279357324, + "grad_norm": 0.21259949229219932, + "learning_rate": 1.9688164997206238e-05, + "loss": 0.3706, + "num_tokens": 500233706.0, + "step": 655 + }, + { + "epoch": 0.8958211088944128, + "grad_norm": 0.21057484774782864, + "learning_rate": 1.9686866510412926e-05, + "loss": 0.3855, + "num_tokens": 501092806.0, + "step": 656 + }, + { + "epoch": 0.8971866898530934, + "grad_norm": 0.23216787577618175, + "learning_rate": 1.9685565373632834e-05, + "loss": 0.3804, + "num_tokens": 501834992.0, + "step": 657 + }, + { + "epoch": 0.8985522708117739, + "grad_norm": 0.20614841009838017, + "learning_rate": 1.9684261587262888e-05, + "loss": 0.3742, + "num_tokens": 502690794.0, + "step": 658 + }, + { + "epoch": 0.8999178517704544, + "grad_norm": 0.20136177360525598, + "learning_rate": 1.9682955151700817e-05, + "loss": 0.3806, + "num_tokens": 503426763.0, + "step": 659 + }, + { + "epoch": 0.9012834327291349, + "grad_norm": 0.22228762928573298, + "learning_rate": 1.9681646067345158e-05, + "loss": 0.369, + "num_tokens": 504173303.0, + "step": 660 + }, + { + "epoch": 0.9026490136878154, + "grad_norm": 0.21876503589304166, + "learning_rate": 1.9680334334595248e-05, + "loss": 0.3737, + "num_tokens": 504997886.0, + "step": 661 + }, + { + "epoch": 0.9040145946464959, + "grad_norm": 0.24491104747026152, + "learning_rate": 1.9679019953851247e-05, + "loss": 0.3825, + "num_tokens": 505734378.0, + "step": 662 + }, + { + "epoch": 0.9053801756051764, + "grad_norm": 0.2017257147123203, + "learning_rate": 1.967770292551412e-05, + "loss": 0.3763, + "num_tokens": 506499275.0, + "step": 663 + }, + { + "epoch": 0.9067457565638569, + "grad_norm": 0.2403346270691312, + "learning_rate": 1.9676383249985624e-05, + "loss": 0.3758, + "num_tokens": 507255021.0, + "step": 664 + }, + { + "epoch": 0.9081113375225374, + "grad_norm": 0.23358497250163435, + "learning_rate": 1.9675060927668347e-05, + "loss": 0.375, + "num_tokens": 507963839.0, + "step": 665 + }, + { + "epoch": 0.9094769184812179, + "grad_norm": 0.2357773088876595, + "learning_rate": 1.9673735958965658e-05, + "loss": 0.3777, + "num_tokens": 508688312.0, + "step": 666 + }, + { + "epoch": 0.9108424994398985, + "grad_norm": 0.23783557968906405, + "learning_rate": 1.967240834428176e-05, + "loss": 0.3776, + "num_tokens": 509488948.0, + "step": 667 + }, + { + "epoch": 0.9122080803985789, + "grad_norm": 0.22183797973535546, + "learning_rate": 1.9671078084021645e-05, + "loss": 0.3818, + "num_tokens": 510270165.0, + "step": 668 + }, + { + "epoch": 0.9135736613572595, + "grad_norm": 0.21231185572697747, + "learning_rate": 1.966974517859112e-05, + "loss": 0.3781, + "num_tokens": 511054964.0, + "step": 669 + }, + { + "epoch": 0.9149392423159399, + "grad_norm": 0.2324792670616, + "learning_rate": 1.9668409628396797e-05, + "loss": 0.4018, + "num_tokens": 511833692.0, + "step": 670 + }, + { + "epoch": 0.9163048232746205, + "grad_norm": 0.1879591669283904, + "learning_rate": 1.966707143384609e-05, + "loss": 0.3747, + "num_tokens": 512558816.0, + "step": 671 + }, + { + "epoch": 0.9176704042333009, + "grad_norm": 0.22183142427459596, + "learning_rate": 1.966573059534723e-05, + "loss": 0.3536, + "num_tokens": 513311507.0, + "step": 672 + }, + { + "epoch": 0.9190359851919815, + "grad_norm": 0.2253608194948316, + "learning_rate": 1.9664387113309243e-05, + "loss": 0.3712, + "num_tokens": 514067137.0, + "step": 673 + }, + { + "epoch": 0.9204015661506619, + "grad_norm": 0.2361307673881122, + "learning_rate": 1.9663040988141973e-05, + "loss": 0.3688, + "num_tokens": 514786276.0, + "step": 674 + }, + { + "epoch": 0.9217671471093425, + "grad_norm": 0.2541250846126281, + "learning_rate": 1.9661692220256064e-05, + "loss": 0.4066, + "num_tokens": 515568565.0, + "step": 675 + }, + { + "epoch": 0.923132728068023, + "grad_norm": 0.21298254805290392, + "learning_rate": 1.966034081006296e-05, + "loss": 0.3722, + "num_tokens": 516327301.0, + "step": 676 + }, + { + "epoch": 0.9244983090267035, + "grad_norm": 0.2562403494200992, + "learning_rate": 1.9658986757974925e-05, + "loss": 0.3993, + "num_tokens": 517075995.0, + "step": 677 + }, + { + "epoch": 0.925863889985384, + "grad_norm": 0.21201433865362707, + "learning_rate": 1.965763006440502e-05, + "loss": 0.3815, + "num_tokens": 517845488.0, + "step": 678 + }, + { + "epoch": 0.9272294709440645, + "grad_norm": 0.24872730840373092, + "learning_rate": 1.9656270729767112e-05, + "loss": 0.3696, + "num_tokens": 518561677.0, + "step": 679 + }, + { + "epoch": 0.928595051902745, + "grad_norm": 0.23794106185666544, + "learning_rate": 1.965490875447587e-05, + "loss": 0.3925, + "num_tokens": 519310106.0, + "step": 680 + }, + { + "epoch": 0.9299606328614255, + "grad_norm": 0.24094389389057022, + "learning_rate": 1.9653544138946784e-05, + "loss": 0.3647, + "num_tokens": 520024498.0, + "step": 681 + }, + { + "epoch": 0.931326213820106, + "grad_norm": 0.2338332993880323, + "learning_rate": 1.9652176883596136e-05, + "loss": 0.3731, + "num_tokens": 520781116.0, + "step": 682 + }, + { + "epoch": 0.9326917947787865, + "grad_norm": 0.22942082959991267, + "learning_rate": 1.9650806988841013e-05, + "loss": 0.3712, + "num_tokens": 521535126.0, + "step": 683 + }, + { + "epoch": 0.934057375737467, + "grad_norm": 0.2051553856267546, + "learning_rate": 1.964943445509931e-05, + "loss": 0.3552, + "num_tokens": 522310489.0, + "step": 684 + }, + { + "epoch": 0.9354229566961476, + "grad_norm": 0.2695420606518578, + "learning_rate": 1.9648059282789736e-05, + "loss": 0.3921, + "num_tokens": 523133176.0, + "step": 685 + }, + { + "epoch": 0.9367885376548281, + "grad_norm": 0.20953155093853865, + "learning_rate": 1.9646681472331786e-05, + "loss": 0.3806, + "num_tokens": 523874078.0, + "step": 686 + }, + { + "epoch": 0.9381541186135086, + "grad_norm": 0.2598467679287756, + "learning_rate": 1.9645301024145774e-05, + "loss": 0.3981, + "num_tokens": 524699884.0, + "step": 687 + }, + { + "epoch": 0.9395196995721891, + "grad_norm": 0.21649808202531495, + "learning_rate": 1.9643917938652818e-05, + "loss": 0.3729, + "num_tokens": 525514167.0, + "step": 688 + }, + { + "epoch": 0.9408852805308696, + "grad_norm": 0.246397584115257, + "learning_rate": 1.9642532216274835e-05, + "loss": 0.3738, + "num_tokens": 526284972.0, + "step": 689 + }, + { + "epoch": 0.9422508614895501, + "grad_norm": 0.25137200087968886, + "learning_rate": 1.9641143857434544e-05, + "loss": 0.3829, + "num_tokens": 527011742.0, + "step": 690 + }, + { + "epoch": 0.9436164424482306, + "grad_norm": 0.24349238916573324, + "learning_rate": 1.9639752862555483e-05, + "loss": 0.3704, + "num_tokens": 527839006.0, + "step": 691 + }, + { + "epoch": 0.9449820234069111, + "grad_norm": 0.22784975772650592, + "learning_rate": 1.963835923206198e-05, + "loss": 0.3932, + "num_tokens": 528615716.0, + "step": 692 + }, + { + "epoch": 0.9463476043655916, + "grad_norm": 0.2592418301980075, + "learning_rate": 1.963696296637917e-05, + "loss": 0.3766, + "num_tokens": 529357369.0, + "step": 693 + }, + { + "epoch": 0.9477131853242722, + "grad_norm": 0.226785712908802, + "learning_rate": 1.9635564065932994e-05, + "loss": 0.3874, + "num_tokens": 530095535.0, + "step": 694 + }, + { + "epoch": 0.9490787662829526, + "grad_norm": 0.2242018039260005, + "learning_rate": 1.9634162531150196e-05, + "loss": 0.37, + "num_tokens": 530804954.0, + "step": 695 + }, + { + "epoch": 0.9504443472416332, + "grad_norm": 0.24557588286173263, + "learning_rate": 1.9632758362458325e-05, + "loss": 0.3827, + "num_tokens": 531557175.0, + "step": 696 + }, + { + "epoch": 0.9518099282003136, + "grad_norm": 0.24643526597426926, + "learning_rate": 1.9631351560285724e-05, + "loss": 0.3916, + "num_tokens": 532314504.0, + "step": 697 + }, + { + "epoch": 0.9531755091589942, + "grad_norm": 0.22644645539793357, + "learning_rate": 1.9629942125061563e-05, + "loss": 0.3815, + "num_tokens": 533088218.0, + "step": 698 + }, + { + "epoch": 0.9545410901176746, + "grad_norm": 0.2812692190074564, + "learning_rate": 1.9628530057215784e-05, + "loss": 0.3751, + "num_tokens": 533834601.0, + "step": 699 + }, + { + "epoch": 0.9559066710763552, + "grad_norm": 0.21426000524672442, + "learning_rate": 1.9627115357179155e-05, + "loss": 0.3648, + "num_tokens": 534588437.0, + "step": 700 + }, + { + "epoch": 0.9572722520350356, + "grad_norm": 0.22514162881816935, + "learning_rate": 1.9625698025383245e-05, + "loss": 0.3536, + "num_tokens": 535343643.0, + "step": 701 + }, + { + "epoch": 0.9586378329937162, + "grad_norm": 0.2021493298704689, + "learning_rate": 1.962427806226041e-05, + "loss": 0.3765, + "num_tokens": 536171406.0, + "step": 702 + }, + { + "epoch": 0.9600034139523967, + "grad_norm": 0.2205587910484086, + "learning_rate": 1.9622855468243823e-05, + "loss": 0.363, + "num_tokens": 536964518.0, + "step": 703 + }, + { + "epoch": 0.9613689949110772, + "grad_norm": 0.21325227571656347, + "learning_rate": 1.962143024376746e-05, + "loss": 0.3878, + "num_tokens": 537756351.0, + "step": 704 + }, + { + "epoch": 0.9627345758697577, + "grad_norm": 0.21954134424156602, + "learning_rate": 1.962000238926609e-05, + "loss": 0.3583, + "num_tokens": 538470208.0, + "step": 705 + }, + { + "epoch": 0.9641001568284382, + "grad_norm": 0.1887864355795394, + "learning_rate": 1.961857190517529e-05, + "loss": 0.3812, + "num_tokens": 539278841.0, + "step": 706 + }, + { + "epoch": 0.9654657377871187, + "grad_norm": 0.2272762567364451, + "learning_rate": 1.9617138791931444e-05, + "loss": 0.366, + "num_tokens": 539923394.0, + "step": 707 + }, + { + "epoch": 0.9668313187457992, + "grad_norm": 0.22447526075519114, + "learning_rate": 1.961570304997173e-05, + "loss": 0.3907, + "num_tokens": 540659009.0, + "step": 708 + }, + { + "epoch": 0.9681968997044798, + "grad_norm": 0.19008200326565305, + "learning_rate": 1.9614264679734126e-05, + "loss": 0.3717, + "num_tokens": 541383570.0, + "step": 709 + }, + { + "epoch": 0.9695624806631602, + "grad_norm": 0.22127452924775257, + "learning_rate": 1.9612823681657424e-05, + "loss": 0.374, + "num_tokens": 542145576.0, + "step": 710 + }, + { + "epoch": 0.9709280616218408, + "grad_norm": 0.22868424422903078, + "learning_rate": 1.96113800561812e-05, + "loss": 0.3755, + "num_tokens": 542939007.0, + "step": 711 + }, + { + "epoch": 0.9722936425805213, + "grad_norm": 0.22089845681080716, + "learning_rate": 1.9609933803745854e-05, + "loss": 0.4115, + "num_tokens": 543712051.0, + "step": 712 + }, + { + "epoch": 0.9736592235392018, + "grad_norm": 0.20968855843561338, + "learning_rate": 1.9608484924792567e-05, + "loss": 0.3653, + "num_tokens": 544447877.0, + "step": 713 + }, + { + "epoch": 0.9750248044978823, + "grad_norm": 0.23661560227078382, + "learning_rate": 1.960703341976333e-05, + "loss": 0.3818, + "num_tokens": 545124888.0, + "step": 714 + }, + { + "epoch": 0.9763903854565628, + "grad_norm": 0.23160933958305577, + "learning_rate": 1.9605579289100937e-05, + "loss": 0.3901, + "num_tokens": 545890233.0, + "step": 715 + }, + { + "epoch": 0.9777559664152433, + "grad_norm": 0.23554837368433398, + "learning_rate": 1.960412253324898e-05, + "loss": 0.368, + "num_tokens": 546603773.0, + "step": 716 + }, + { + "epoch": 0.9791215473739238, + "grad_norm": 0.2286169550517569, + "learning_rate": 1.9602663152651847e-05, + "loss": 0.3786, + "num_tokens": 547453280.0, + "step": 717 + }, + { + "epoch": 0.9804871283326043, + "grad_norm": 0.22218825095315792, + "learning_rate": 1.960120114775474e-05, + "loss": 0.3813, + "num_tokens": 548172445.0, + "step": 718 + }, + { + "epoch": 0.9818527092912848, + "grad_norm": 0.2275151337830258, + "learning_rate": 1.9599736519003646e-05, + "loss": 0.3813, + "num_tokens": 548891111.0, + "step": 719 + }, + { + "epoch": 0.9832182902499653, + "grad_norm": 0.2187378857657384, + "learning_rate": 1.9598269266845367e-05, + "loss": 0.3694, + "num_tokens": 549684049.0, + "step": 720 + }, + { + "epoch": 0.9845838712086459, + "grad_norm": 0.2112126150058604, + "learning_rate": 1.9596799391727492e-05, + "loss": 0.3646, + "num_tokens": 550381777.0, + "step": 721 + }, + { + "epoch": 0.9859494521673263, + "grad_norm": 0.2437941576809699, + "learning_rate": 1.9595326894098417e-05, + "loss": 0.3871, + "num_tokens": 551248951.0, + "step": 722 + }, + { + "epoch": 0.9873150331260069, + "grad_norm": 0.2286558699225236, + "learning_rate": 1.959385177440734e-05, + "loss": 0.3701, + "num_tokens": 551973185.0, + "step": 723 + }, + { + "epoch": 0.9886806140846873, + "grad_norm": 0.2461869569615954, + "learning_rate": 1.9592374033104254e-05, + "loss": 0.3641, + "num_tokens": 552740161.0, + "step": 724 + }, + { + "epoch": 0.9900461950433679, + "grad_norm": 0.2073019114118674, + "learning_rate": 1.9590893670639955e-05, + "loss": 0.3709, + "num_tokens": 553526563.0, + "step": 725 + }, + { + "epoch": 0.9914117760020483, + "grad_norm": 0.2474048144401556, + "learning_rate": 1.9589410687466038e-05, + "loss": 0.3701, + "num_tokens": 554245398.0, + "step": 726 + }, + { + "epoch": 0.9927773569607289, + "grad_norm": 0.21628458821804883, + "learning_rate": 1.9587925084034895e-05, + "loss": 0.387, + "num_tokens": 554985005.0, + "step": 727 + }, + { + "epoch": 0.9941429379194093, + "grad_norm": 0.2523858627994551, + "learning_rate": 1.9586436860799717e-05, + "loss": 0.3958, + "num_tokens": 555748935.0, + "step": 728 + }, + { + "epoch": 0.9955085188780899, + "grad_norm": 0.2186895191900311, + "learning_rate": 1.9584946018214507e-05, + "loss": 0.3715, + "num_tokens": 556485927.0, + "step": 729 + }, + { + "epoch": 0.9968740998367704, + "grad_norm": 0.2242641076845794, + "learning_rate": 1.9583452556734043e-05, + "loss": 0.3698, + "num_tokens": 557294696.0, + "step": 730 + }, + { + "epoch": 0.9982396807954509, + "grad_norm": 0.2131695371398388, + "learning_rate": 1.9581956476813927e-05, + "loss": 0.3805, + "num_tokens": 558033148.0, + "step": 731 + }, + { + "epoch": 0.9996052617541314, + "grad_norm": 0.23320356040938617, + "learning_rate": 1.958045777891054e-05, + "loss": 0.3756, + "num_tokens": 558794487.0, + "step": 732 + }, + { + "epoch": 1.0, + "grad_norm": 0.23320356040938617, + "learning_rate": 1.957895646348107e-05, + "loss": 0.3803, + "num_tokens": 559042136.0, + "step": 733 + }, + { + "epoch": 1.0013655809586806, + "grad_norm": 0.3825135940407043, + "learning_rate": 1.957745253098351e-05, + "loss": 0.3416, + "num_tokens": 559798424.0, + "step": 734 + }, + { + "epoch": 1.0027311619173611, + "grad_norm": 0.2897650651273311, + "learning_rate": 1.9575945981876635e-05, + "loss": 0.3536, + "num_tokens": 560570310.0, + "step": 735 + }, + { + "epoch": 1.0040967428760414, + "grad_norm": 0.2290980247840941, + "learning_rate": 1.9574436816620038e-05, + "loss": 0.3438, + "num_tokens": 561257328.0, + "step": 736 + }, + { + "epoch": 1.005462323834722, + "grad_norm": 0.2908496468928532, + "learning_rate": 1.95729250356741e-05, + "loss": 0.3561, + "num_tokens": 562035083.0, + "step": 737 + }, + { + "epoch": 1.0068279047934026, + "grad_norm": 0.24474109613460202, + "learning_rate": 1.957141063949999e-05, + "loss": 0.34, + "num_tokens": 562778400.0, + "step": 738 + }, + { + "epoch": 1.0081934857520831, + "grad_norm": 0.25421021892130446, + "learning_rate": 1.9569893628559692e-05, + "loss": 0.3387, + "num_tokens": 563508421.0, + "step": 739 + }, + { + "epoch": 1.0095590667107635, + "grad_norm": 0.22870380389667097, + "learning_rate": 1.9568374003315974e-05, + "loss": 0.3512, + "num_tokens": 564286882.0, + "step": 740 + }, + { + "epoch": 1.010924647669444, + "grad_norm": 0.2645570751433239, + "learning_rate": 1.9566851764232414e-05, + "loss": 0.3539, + "num_tokens": 565065347.0, + "step": 741 + }, + { + "epoch": 1.0122902286281246, + "grad_norm": 0.2174071442958411, + "learning_rate": 1.9565326911773385e-05, + "loss": 0.3322, + "num_tokens": 565906059.0, + "step": 742 + }, + { + "epoch": 1.0136558095868051, + "grad_norm": 0.2551581290015055, + "learning_rate": 1.956379944640404e-05, + "loss": 0.3734, + "num_tokens": 566665307.0, + "step": 743 + }, + { + "epoch": 1.0150213905454857, + "grad_norm": 0.2653134988662466, + "learning_rate": 1.9562269368590357e-05, + "loss": 0.3442, + "num_tokens": 567409795.0, + "step": 744 + }, + { + "epoch": 1.016386971504166, + "grad_norm": 0.24862266844549816, + "learning_rate": 1.9560736678799088e-05, + "loss": 0.3419, + "num_tokens": 568121859.0, + "step": 745 + }, + { + "epoch": 1.0177525524628466, + "grad_norm": 0.2398461782965882, + "learning_rate": 1.955920137749779e-05, + "loss": 0.3355, + "num_tokens": 568949621.0, + "step": 746 + }, + { + "epoch": 1.0191181334215271, + "grad_norm": 0.2271463646011032, + "learning_rate": 1.9557663465154824e-05, + "loss": 0.3459, + "num_tokens": 569716202.0, + "step": 747 + }, + { + "epoch": 1.0204837143802077, + "grad_norm": 0.2374417496076064, + "learning_rate": 1.955612294223933e-05, + "loss": 0.3514, + "num_tokens": 570522506.0, + "step": 748 + }, + { + "epoch": 1.021849295338888, + "grad_norm": 0.25175071136036276, + "learning_rate": 1.9554579809221264e-05, + "loss": 0.36, + "num_tokens": 571313681.0, + "step": 749 + }, + { + "epoch": 1.0232148762975686, + "grad_norm": 0.21937228293588593, + "learning_rate": 1.9553034066571366e-05, + "loss": 0.3505, + "num_tokens": 572099039.0, + "step": 750 + }, + { + "epoch": 1.0245804572562491, + "grad_norm": 0.2252337411630276, + "learning_rate": 1.9551485714761173e-05, + "loss": 0.3406, + "num_tokens": 572867223.0, + "step": 751 + }, + { + "epoch": 1.0259460382149297, + "grad_norm": 0.23169573422516954, + "learning_rate": 1.9549934754263023e-05, + "loss": 0.3312, + "num_tokens": 573561839.0, + "step": 752 + }, + { + "epoch": 1.0273116191736102, + "grad_norm": 0.21392985642691065, + "learning_rate": 1.9548381185550047e-05, + "loss": 0.3419, + "num_tokens": 574292688.0, + "step": 753 + }, + { + "epoch": 1.0286772001322906, + "grad_norm": 0.2243638225576261, + "learning_rate": 1.9546825009096167e-05, + "loss": 0.356, + "num_tokens": 575096073.0, + "step": 754 + }, + { + "epoch": 1.0300427810909711, + "grad_norm": 0.21332838620710934, + "learning_rate": 1.9545266225376112e-05, + "loss": 0.3389, + "num_tokens": 575916595.0, + "step": 755 + }, + { + "epoch": 1.0314083620496517, + "grad_norm": 0.1992051714308279, + "learning_rate": 1.9543704834865388e-05, + "loss": 0.3444, + "num_tokens": 576712138.0, + "step": 756 + }, + { + "epoch": 1.0327739430083323, + "grad_norm": 0.21746986145790495, + "learning_rate": 1.9542140838040324e-05, + "loss": 0.3399, + "num_tokens": 577507085.0, + "step": 757 + }, + { + "epoch": 1.0341395239670126, + "grad_norm": 0.22343420622840984, + "learning_rate": 1.9540574235378014e-05, + "loss": 0.3586, + "num_tokens": 578262098.0, + "step": 758 + }, + { + "epoch": 1.0355051049256931, + "grad_norm": 0.22772194947028826, + "learning_rate": 1.9539005027356366e-05, + "loss": 0.334, + "num_tokens": 579137978.0, + "step": 759 + }, + { + "epoch": 1.0368706858843737, + "grad_norm": 0.19720526744651076, + "learning_rate": 1.953743321445408e-05, + "loss": 0.343, + "num_tokens": 579892754.0, + "step": 760 + }, + { + "epoch": 1.0382362668430543, + "grad_norm": 0.236702180067579, + "learning_rate": 1.953585879715064e-05, + "loss": 0.3508, + "num_tokens": 580625847.0, + "step": 761 + }, + { + "epoch": 1.0396018478017348, + "grad_norm": 0.23934596109582543, + "learning_rate": 1.9534281775926346e-05, + "loss": 0.3619, + "num_tokens": 581386690.0, + "step": 762 + }, + { + "epoch": 1.0409674287604151, + "grad_norm": 0.2131402454693042, + "learning_rate": 1.9532702151262262e-05, + "loss": 0.3375, + "num_tokens": 582155426.0, + "step": 763 + }, + { + "epoch": 1.0423330097190957, + "grad_norm": 0.19336758783053334, + "learning_rate": 1.9531119923640276e-05, + "loss": 0.3336, + "num_tokens": 582902124.0, + "step": 764 + }, + { + "epoch": 1.0436985906777763, + "grad_norm": 0.2114730972701699, + "learning_rate": 1.952953509354305e-05, + "loss": 0.3445, + "num_tokens": 583659563.0, + "step": 765 + }, + { + "epoch": 1.0450641716364568, + "grad_norm": 0.21433503240186524, + "learning_rate": 1.9527947661454054e-05, + "loss": 0.3481, + "num_tokens": 584429340.0, + "step": 766 + }, + { + "epoch": 1.0464297525951372, + "grad_norm": 0.21369663933442637, + "learning_rate": 1.9526357627857537e-05, + "loss": 0.3624, + "num_tokens": 585223552.0, + "step": 767 + }, + { + "epoch": 1.0477953335538177, + "grad_norm": 0.2200008386516188, + "learning_rate": 1.9524764993238553e-05, + "loss": 0.3526, + "num_tokens": 585982078.0, + "step": 768 + }, + { + "epoch": 1.0491609145124983, + "grad_norm": 0.20975499602522088, + "learning_rate": 1.9523169758082946e-05, + "loss": 0.3361, + "num_tokens": 586703202.0, + "step": 769 + }, + { + "epoch": 1.0505264954711788, + "grad_norm": 0.2009931673157653, + "learning_rate": 1.9521571922877355e-05, + "loss": 0.3611, + "num_tokens": 587497570.0, + "step": 770 + }, + { + "epoch": 1.0518920764298594, + "grad_norm": 0.22128574621686295, + "learning_rate": 1.9519971488109207e-05, + "loss": 0.334, + "num_tokens": 588184932.0, + "step": 771 + }, + { + "epoch": 1.0532576573885397, + "grad_norm": 0.21869235992327593, + "learning_rate": 1.9518368454266726e-05, + "loss": 0.3405, + "num_tokens": 588950129.0, + "step": 772 + }, + { + "epoch": 1.0546232383472203, + "grad_norm": 0.225898003741059, + "learning_rate": 1.9516762821838927e-05, + "loss": 0.3382, + "num_tokens": 589714638.0, + "step": 773 + }, + { + "epoch": 1.0559888193059008, + "grad_norm": 0.24083446279434642, + "learning_rate": 1.951515459131562e-05, + "loss": 0.3335, + "num_tokens": 590509231.0, + "step": 774 + }, + { + "epoch": 1.0573544002645814, + "grad_norm": 0.2238857327015622, + "learning_rate": 1.9513543763187415e-05, + "loss": 0.3405, + "num_tokens": 591277832.0, + "step": 775 + }, + { + "epoch": 1.0587199812232617, + "grad_norm": 0.2304878398059193, + "learning_rate": 1.951193033794569e-05, + "loss": 0.3265, + "num_tokens": 591950717.0, + "step": 776 + }, + { + "epoch": 1.0600855621819423, + "grad_norm": 0.2106306513372773, + "learning_rate": 1.951031431608264e-05, + "loss": 0.3675, + "num_tokens": 592801890.0, + "step": 777 + }, + { + "epoch": 1.0614511431406228, + "grad_norm": 0.24626933009606206, + "learning_rate": 1.9508695698091248e-05, + "loss": 0.3401, + "num_tokens": 593614528.0, + "step": 778 + }, + { + "epoch": 1.0628167240993034, + "grad_norm": 0.2036973708429303, + "learning_rate": 1.9507074484465275e-05, + "loss": 0.3273, + "num_tokens": 594406685.0, + "step": 779 + }, + { + "epoch": 1.064182305057984, + "grad_norm": 0.27014717985201614, + "learning_rate": 1.9505450675699288e-05, + "loss": 0.3469, + "num_tokens": 595158188.0, + "step": 780 + }, + { + "epoch": 1.0655478860166643, + "grad_norm": 0.21963088425190025, + "learning_rate": 1.9503824272288638e-05, + "loss": 0.3545, + "num_tokens": 595927943.0, + "step": 781 + }, + { + "epoch": 1.0669134669753448, + "grad_norm": 0.2106585797718594, + "learning_rate": 1.9502195274729472e-05, + "loss": 0.3483, + "num_tokens": 596795301.0, + "step": 782 + }, + { + "epoch": 1.0682790479340254, + "grad_norm": 0.23246998475077077, + "learning_rate": 1.950056368351873e-05, + "loss": 0.3425, + "num_tokens": 597598411.0, + "step": 783 + }, + { + "epoch": 1.069644628892706, + "grad_norm": 0.24328956963721618, + "learning_rate": 1.949892949915413e-05, + "loss": 0.3529, + "num_tokens": 598380624.0, + "step": 784 + }, + { + "epoch": 1.0710102098513863, + "grad_norm": 0.23043651709543356, + "learning_rate": 1.9497292722134204e-05, + "loss": 0.332, + "num_tokens": 599130592.0, + "step": 785 + }, + { + "epoch": 1.0723757908100668, + "grad_norm": 0.3203703356203683, + "learning_rate": 1.949565335295825e-05, + "loss": 0.3387, + "num_tokens": 599807665.0, + "step": 786 + }, + { + "epoch": 1.0737413717687474, + "grad_norm": 0.26005769731686473, + "learning_rate": 1.9494011392126375e-05, + "loss": 0.3343, + "num_tokens": 600571353.0, + "step": 787 + }, + { + "epoch": 1.075106952727428, + "grad_norm": 0.21478417178316983, + "learning_rate": 1.9492366840139468e-05, + "loss": 0.3556, + "num_tokens": 601282480.0, + "step": 788 + }, + { + "epoch": 1.0764725336861085, + "grad_norm": 0.22354619987300667, + "learning_rate": 1.9490719697499213e-05, + "loss": 0.3534, + "num_tokens": 602059188.0, + "step": 789 + }, + { + "epoch": 1.0778381146447888, + "grad_norm": 0.23341858936439303, + "learning_rate": 1.9489069964708078e-05, + "loss": 0.363, + "num_tokens": 602849615.0, + "step": 790 + }, + { + "epoch": 1.0792036956034694, + "grad_norm": 0.21369895982916975, + "learning_rate": 1.9487417642269328e-05, + "loss": 0.3432, + "num_tokens": 603610326.0, + "step": 791 + }, + { + "epoch": 1.08056927656215, + "grad_norm": 0.23214500777604785, + "learning_rate": 1.9485762730687014e-05, + "loss": 0.3395, + "num_tokens": 604338380.0, + "step": 792 + }, + { + "epoch": 1.0819348575208305, + "grad_norm": 0.20705994014107917, + "learning_rate": 1.9484105230465978e-05, + "loss": 0.3524, + "num_tokens": 605077064.0, + "step": 793 + }, + { + "epoch": 1.0833004384795109, + "grad_norm": 0.24173710866201498, + "learning_rate": 1.9482445142111852e-05, + "loss": 0.3496, + "num_tokens": 605809585.0, + "step": 794 + }, + { + "epoch": 1.0846660194381914, + "grad_norm": 0.20929193210864164, + "learning_rate": 1.9480782466131056e-05, + "loss": 0.3354, + "num_tokens": 606598541.0, + "step": 795 + }, + { + "epoch": 1.086031600396872, + "grad_norm": 0.24458946452512642, + "learning_rate": 1.9479117203030803e-05, + "loss": 0.3373, + "num_tokens": 607302298.0, + "step": 796 + }, + { + "epoch": 1.0873971813555525, + "grad_norm": 0.21853750631580782, + "learning_rate": 1.947744935331909e-05, + "loss": 0.3541, + "num_tokens": 608046682.0, + "step": 797 + }, + { + "epoch": 1.088762762314233, + "grad_norm": 0.28036874146332763, + "learning_rate": 1.9475778917504703e-05, + "loss": 0.3607, + "num_tokens": 608810756.0, + "step": 798 + }, + { + "epoch": 1.0901283432729134, + "grad_norm": 0.2127303801483816, + "learning_rate": 1.947410589609723e-05, + "loss": 0.3544, + "num_tokens": 609603922.0, + "step": 799 + }, + { + "epoch": 1.091493924231594, + "grad_norm": 0.2359714588934887, + "learning_rate": 1.9472430289607025e-05, + "loss": 0.3314, + "num_tokens": 610306230.0, + "step": 800 + }, + { + "epoch": 1.0928595051902745, + "grad_norm": 0.2433090672983506, + "learning_rate": 1.947075209854525e-05, + "loss": 0.3377, + "num_tokens": 611015690.0, + "step": 801 + }, + { + "epoch": 1.094225086148955, + "grad_norm": 0.23014334300961076, + "learning_rate": 1.9469071323423844e-05, + "loss": 0.3365, + "num_tokens": 611790112.0, + "step": 802 + }, + { + "epoch": 1.0955906671076354, + "grad_norm": 0.28148534782836104, + "learning_rate": 1.9467387964755545e-05, + "loss": 0.3454, + "num_tokens": 612578023.0, + "step": 803 + }, + { + "epoch": 1.096956248066316, + "grad_norm": 0.23569966182226046, + "learning_rate": 1.9465702023053866e-05, + "loss": 0.3623, + "num_tokens": 613357473.0, + "step": 804 + }, + { + "epoch": 1.0983218290249965, + "grad_norm": 0.206852317821413, + "learning_rate": 1.9464013498833123e-05, + "loss": 0.3355, + "num_tokens": 614079911.0, + "step": 805 + }, + { + "epoch": 1.099687409983677, + "grad_norm": 0.21388520928879326, + "learning_rate": 1.9462322392608402e-05, + "loss": 0.3447, + "num_tokens": 614976527.0, + "step": 806 + }, + { + "epoch": 1.1010529909423576, + "grad_norm": 0.18739995316002714, + "learning_rate": 1.9460628704895595e-05, + "loss": 0.3242, + "num_tokens": 615739303.0, + "step": 807 + }, + { + "epoch": 1.102418571901038, + "grad_norm": 0.23377448662966474, + "learning_rate": 1.9458932436211365e-05, + "loss": 0.3433, + "num_tokens": 616511728.0, + "step": 808 + }, + { + "epoch": 1.1037841528597185, + "grad_norm": 0.22987354440488406, + "learning_rate": 1.9457233587073177e-05, + "loss": 0.3582, + "num_tokens": 617274800.0, + "step": 809 + }, + { + "epoch": 1.105149733818399, + "grad_norm": 0.23123100282429546, + "learning_rate": 1.945553215799927e-05, + "loss": 0.3418, + "num_tokens": 618008106.0, + "step": 810 + }, + { + "epoch": 1.1065153147770797, + "grad_norm": 0.21625497188314524, + "learning_rate": 1.9453828149508684e-05, + "loss": 0.3403, + "num_tokens": 618700596.0, + "step": 811 + }, + { + "epoch": 1.10788089573576, + "grad_norm": 0.22647218761980423, + "learning_rate": 1.9452121562121232e-05, + "loss": 0.3526, + "num_tokens": 619527542.0, + "step": 812 + }, + { + "epoch": 1.1092464766944405, + "grad_norm": 0.21575319555420466, + "learning_rate": 1.945041239635752e-05, + "loss": 0.3368, + "num_tokens": 620334333.0, + "step": 813 + }, + { + "epoch": 1.110612057653121, + "grad_norm": 0.20076794897715036, + "learning_rate": 1.9448700652738943e-05, + "loss": 0.3388, + "num_tokens": 621031584.0, + "step": 814 + }, + { + "epoch": 1.1119776386118017, + "grad_norm": 0.3121453839366858, + "learning_rate": 1.944698633178768e-05, + "loss": 0.3399, + "num_tokens": 621769147.0, + "step": 815 + }, + { + "epoch": 1.1133432195704822, + "grad_norm": 0.2289933751086313, + "learning_rate": 1.9445269434026696e-05, + "loss": 0.3337, + "num_tokens": 622519658.0, + "step": 816 + }, + { + "epoch": 1.1147088005291625, + "grad_norm": 0.241690252259585, + "learning_rate": 1.9443549959979738e-05, + "loss": 0.363, + "num_tokens": 623293585.0, + "step": 817 + }, + { + "epoch": 1.116074381487843, + "grad_norm": 0.20991941188165925, + "learning_rate": 1.9441827910171347e-05, + "loss": 0.3464, + "num_tokens": 624144270.0, + "step": 818 + }, + { + "epoch": 1.1174399624465237, + "grad_norm": 0.2410147117893179, + "learning_rate": 1.9440103285126847e-05, + "loss": 0.3541, + "num_tokens": 624944787.0, + "step": 819 + }, + { + "epoch": 1.1188055434052042, + "grad_norm": 0.2051297873721244, + "learning_rate": 1.943837608537234e-05, + "loss": 0.3507, + "num_tokens": 625748395.0, + "step": 820 + }, + { + "epoch": 1.1201711243638846, + "grad_norm": 0.22664534948103088, + "learning_rate": 1.943664631143473e-05, + "loss": 0.3352, + "num_tokens": 626474429.0, + "step": 821 + }, + { + "epoch": 1.121536705322565, + "grad_norm": 0.21670359113765952, + "learning_rate": 1.943491396384169e-05, + "loss": 0.3497, + "num_tokens": 627243813.0, + "step": 822 + }, + { + "epoch": 1.1229022862812457, + "grad_norm": 0.20899784816306777, + "learning_rate": 1.9433179043121683e-05, + "loss": 0.3451, + "num_tokens": 628020340.0, + "step": 823 + }, + { + "epoch": 1.1242678672399262, + "grad_norm": 0.22411570922905472, + "learning_rate": 1.943144154980396e-05, + "loss": 0.3441, + "num_tokens": 628762821.0, + "step": 824 + }, + { + "epoch": 1.1256334481986068, + "grad_norm": 0.22009284476576324, + "learning_rate": 1.9429701484418553e-05, + "loss": 0.3323, + "num_tokens": 629539587.0, + "step": 825 + }, + { + "epoch": 1.1269990291572871, + "grad_norm": 0.21429558195937343, + "learning_rate": 1.942795884749628e-05, + "loss": 0.3612, + "num_tokens": 630258493.0, + "step": 826 + }, + { + "epoch": 1.1283646101159677, + "grad_norm": 0.22840451785751004, + "learning_rate": 1.9426213639568753e-05, + "loss": 0.3431, + "num_tokens": 631009914.0, + "step": 827 + }, + { + "epoch": 1.1297301910746482, + "grad_norm": 0.21718260945448883, + "learning_rate": 1.9424465861168353e-05, + "loss": 0.3448, + "num_tokens": 631759759.0, + "step": 828 + }, + { + "epoch": 1.1310957720333288, + "grad_norm": 0.20919764440849264, + "learning_rate": 1.9422715512828248e-05, + "loss": 0.3331, + "num_tokens": 632434149.0, + "step": 829 + }, + { + "epoch": 1.1324613529920091, + "grad_norm": 0.23389433499368334, + "learning_rate": 1.9420962595082396e-05, + "loss": 0.3164, + "num_tokens": 633144885.0, + "step": 830 + }, + { + "epoch": 1.1338269339506897, + "grad_norm": 0.2295696284022184, + "learning_rate": 1.9419207108465538e-05, + "loss": 0.3306, + "num_tokens": 633828564.0, + "step": 831 + }, + { + "epoch": 1.1351925149093702, + "grad_norm": 0.22530533159356056, + "learning_rate": 1.94174490535132e-05, + "loss": 0.3394, + "num_tokens": 634554740.0, + "step": 832 + }, + { + "epoch": 1.1365580958680508, + "grad_norm": 0.20058206707021065, + "learning_rate": 1.941568843076168e-05, + "loss": 0.3435, + "num_tokens": 635271883.0, + "step": 833 + }, + { + "epoch": 1.1379236768267313, + "grad_norm": 0.23175101701616693, + "learning_rate": 1.9413925240748074e-05, + "loss": 0.353, + "num_tokens": 635992085.0, + "step": 834 + }, + { + "epoch": 1.1392892577854117, + "grad_norm": 0.22536288869732607, + "learning_rate": 1.9412159484010254e-05, + "loss": 0.3542, + "num_tokens": 636717912.0, + "step": 835 + }, + { + "epoch": 1.1406548387440922, + "grad_norm": 0.22346931543715234, + "learning_rate": 1.941039116108687e-05, + "loss": 0.3402, + "num_tokens": 637491230.0, + "step": 836 + }, + { + "epoch": 1.1420204197027728, + "grad_norm": 0.21597589067749015, + "learning_rate": 1.940862027251737e-05, + "loss": 0.3432, + "num_tokens": 638285530.0, + "step": 837 + }, + { + "epoch": 1.1433860006614533, + "grad_norm": 0.21226927129186632, + "learning_rate": 1.940684681884197e-05, + "loss": 0.3394, + "num_tokens": 639051804.0, + "step": 838 + }, + { + "epoch": 1.1447515816201337, + "grad_norm": 0.21502298436100398, + "learning_rate": 1.9405070800601677e-05, + "loss": 0.351, + "num_tokens": 639875837.0, + "step": 839 + }, + { + "epoch": 1.1461171625788142, + "grad_norm": 0.22064724264056018, + "learning_rate": 1.9403292218338276e-05, + "loss": 0.3503, + "num_tokens": 640615235.0, + "step": 840 + }, + { + "epoch": 1.1474827435374948, + "grad_norm": 0.2026653755473225, + "learning_rate": 1.940151107259433e-05, + "loss": 0.3435, + "num_tokens": 641400053.0, + "step": 841 + }, + { + "epoch": 1.1488483244961754, + "grad_norm": 0.22108893109365524, + "learning_rate": 1.93997273639132e-05, + "loss": 0.3396, + "num_tokens": 642184918.0, + "step": 842 + }, + { + "epoch": 1.150213905454856, + "grad_norm": 0.1960884338704318, + "learning_rate": 1.939794109283901e-05, + "loss": 0.3438, + "num_tokens": 642971503.0, + "step": 843 + }, + { + "epoch": 1.1515794864135362, + "grad_norm": 0.2257429508864646, + "learning_rate": 1.9396152259916676e-05, + "loss": 0.3593, + "num_tokens": 643755043.0, + "step": 844 + }, + { + "epoch": 1.1529450673722168, + "grad_norm": 0.21499626799680963, + "learning_rate": 1.9394360865691896e-05, + "loss": 0.3539, + "num_tokens": 644582058.0, + "step": 845 + }, + { + "epoch": 1.1543106483308974, + "grad_norm": 0.2304590106792569, + "learning_rate": 1.939256691071114e-05, + "loss": 0.3326, + "num_tokens": 645381828.0, + "step": 846 + }, + { + "epoch": 1.155676229289578, + "grad_norm": 0.21592350712455738, + "learning_rate": 1.9390770395521674e-05, + "loss": 0.338, + "num_tokens": 646206644.0, + "step": 847 + }, + { + "epoch": 1.1570418102482583, + "grad_norm": 0.2529983356660823, + "learning_rate": 1.9388971320671533e-05, + "loss": 0.3424, + "num_tokens": 646917687.0, + "step": 848 + }, + { + "epoch": 1.1584073912069388, + "grad_norm": 0.2241649505814211, + "learning_rate": 1.938716968670954e-05, + "loss": 0.3486, + "num_tokens": 647662144.0, + "step": 849 + }, + { + "epoch": 1.1597729721656194, + "grad_norm": 0.2163814578235409, + "learning_rate": 1.938536549418529e-05, + "loss": 0.3494, + "num_tokens": 648424916.0, + "step": 850 + }, + { + "epoch": 1.1611385531243, + "grad_norm": 0.24955187239088902, + "learning_rate": 1.9383558743649168e-05, + "loss": 0.3584, + "num_tokens": 649156600.0, + "step": 851 + }, + { + "epoch": 1.1625041340829805, + "grad_norm": 0.23446575168126618, + "learning_rate": 1.9381749435652337e-05, + "loss": 0.3536, + "num_tokens": 649917916.0, + "step": 852 + }, + { + "epoch": 1.1638697150416608, + "grad_norm": 0.22634812733227788, + "learning_rate": 1.9379937570746733e-05, + "loss": 0.3409, + "num_tokens": 650642497.0, + "step": 853 + }, + { + "epoch": 1.1652352960003414, + "grad_norm": 0.2307793578805773, + "learning_rate": 1.9378123149485082e-05, + "loss": 0.3461, + "num_tokens": 651386219.0, + "step": 854 + }, + { + "epoch": 1.166600876959022, + "grad_norm": 0.23303330781466192, + "learning_rate": 1.937630617242088e-05, + "loss": 0.3594, + "num_tokens": 652102809.0, + "step": 855 + }, + { + "epoch": 1.1679664579177025, + "grad_norm": 0.21770567500218296, + "learning_rate": 1.9374486640108416e-05, + "loss": 0.346, + "num_tokens": 652873368.0, + "step": 856 + }, + { + "epoch": 1.1693320388763828, + "grad_norm": 0.2132618376781528, + "learning_rate": 1.9372664553102743e-05, + "loss": 0.361, + "num_tokens": 653648340.0, + "step": 857 + }, + { + "epoch": 1.1706976198350634, + "grad_norm": 0.20701407389920137, + "learning_rate": 1.9370839911959708e-05, + "loss": 0.3343, + "num_tokens": 654442924.0, + "step": 858 + }, + { + "epoch": 1.172063200793744, + "grad_norm": 0.2758451261437571, + "learning_rate": 1.9369012717235922e-05, + "loss": 0.3398, + "num_tokens": 655195472.0, + "step": 859 + }, + { + "epoch": 1.1734287817524245, + "grad_norm": 0.20518300542704662, + "learning_rate": 1.9367182969488792e-05, + "loss": 0.356, + "num_tokens": 655973657.0, + "step": 860 + }, + { + "epoch": 1.174794362711105, + "grad_norm": 0.23288096668252745, + "learning_rate": 1.9365350669276482e-05, + "loss": 0.3655, + "num_tokens": 656753462.0, + "step": 861 + }, + { + "epoch": 1.1761599436697854, + "grad_norm": 0.2257984443735613, + "learning_rate": 1.9363515817157963e-05, + "loss": 0.3396, + "num_tokens": 657577877.0, + "step": 862 + }, + { + "epoch": 1.177525524628466, + "grad_norm": 0.2094542616694453, + "learning_rate": 1.936167841369296e-05, + "loss": 0.3403, + "num_tokens": 658374789.0, + "step": 863 + }, + { + "epoch": 1.1788911055871465, + "grad_norm": 0.18789618185999296, + "learning_rate": 1.9359838459441985e-05, + "loss": 0.3248, + "num_tokens": 659115923.0, + "step": 864 + }, + { + "epoch": 1.180256686545827, + "grad_norm": 0.21942954207411852, + "learning_rate": 1.935799595496633e-05, + "loss": 0.3385, + "num_tokens": 659873530.0, + "step": 865 + }, + { + "epoch": 1.1816222675045074, + "grad_norm": 0.2079256007257021, + "learning_rate": 1.935615090082806e-05, + "loss": 0.363, + "num_tokens": 660687369.0, + "step": 866 + }, + { + "epoch": 1.182987848463188, + "grad_norm": 0.22514594796201975, + "learning_rate": 1.935430329759003e-05, + "loss": 0.3467, + "num_tokens": 661457954.0, + "step": 867 + }, + { + "epoch": 1.1843534294218685, + "grad_norm": 0.18731723865554684, + "learning_rate": 1.9352453145815854e-05, + "loss": 0.3626, + "num_tokens": 662210300.0, + "step": 868 + }, + { + "epoch": 1.185719010380549, + "grad_norm": 0.22170893892455693, + "learning_rate": 1.935060044606994e-05, + "loss": 0.3322, + "num_tokens": 662931119.0, + "step": 869 + }, + { + "epoch": 1.1870845913392296, + "grad_norm": 0.1992673749908861, + "learning_rate": 1.934874519891746e-05, + "loss": 0.3487, + "num_tokens": 663663455.0, + "step": 870 + }, + { + "epoch": 1.18845017229791, + "grad_norm": 0.20808166381530568, + "learning_rate": 1.934688740492438e-05, + "loss": 0.342, + "num_tokens": 664534798.0, + "step": 871 + }, + { + "epoch": 1.1898157532565905, + "grad_norm": 0.22087488348609283, + "learning_rate": 1.934502706465742e-05, + "loss": 0.3452, + "num_tokens": 665300874.0, + "step": 872 + }, + { + "epoch": 1.191181334215271, + "grad_norm": 0.23784556953514013, + "learning_rate": 1.9343164178684093e-05, + "loss": 0.3692, + "num_tokens": 666005267.0, + "step": 873 + }, + { + "epoch": 1.1925469151739516, + "grad_norm": 0.20292370579501118, + "learning_rate": 1.934129874757269e-05, + "loss": 0.337, + "num_tokens": 666734740.0, + "step": 874 + }, + { + "epoch": 1.193912496132632, + "grad_norm": 0.2217411028595786, + "learning_rate": 1.933943077189227e-05, + "loss": 0.3448, + "num_tokens": 667492731.0, + "step": 875 + }, + { + "epoch": 1.1952780770913125, + "grad_norm": 0.21870724777646677, + "learning_rate": 1.9337560252212673e-05, + "loss": 0.3277, + "num_tokens": 668243503.0, + "step": 876 + }, + { + "epoch": 1.196643658049993, + "grad_norm": 0.20457576767489188, + "learning_rate": 1.933568718910451e-05, + "loss": 0.3522, + "num_tokens": 669092752.0, + "step": 877 + }, + { + "epoch": 1.1980092390086736, + "grad_norm": 0.22226510666272514, + "learning_rate": 1.9333811583139173e-05, + "loss": 0.3582, + "num_tokens": 669851085.0, + "step": 878 + }, + { + "epoch": 1.1993748199673542, + "grad_norm": 0.21376895191728973, + "learning_rate": 1.933193343488883e-05, + "loss": 0.3332, + "num_tokens": 670602523.0, + "step": 879 + }, + { + "epoch": 1.2007404009260345, + "grad_norm": 0.1937378493811625, + "learning_rate": 1.9330052744926424e-05, + "loss": 0.3436, + "num_tokens": 671411552.0, + "step": 880 + }, + { + "epoch": 1.202105981884715, + "grad_norm": 0.20017633469468055, + "learning_rate": 1.9328169513825664e-05, + "loss": 0.3446, + "num_tokens": 672193651.0, + "step": 881 + }, + { + "epoch": 1.2034715628433956, + "grad_norm": 0.2200322586900562, + "learning_rate": 1.932628374216105e-05, + "loss": 0.3523, + "num_tokens": 672976492.0, + "step": 882 + }, + { + "epoch": 1.2048371438020762, + "grad_norm": 0.21024693808266928, + "learning_rate": 1.9324395430507847e-05, + "loss": 0.3384, + "num_tokens": 673659643.0, + "step": 883 + }, + { + "epoch": 1.2062027247607565, + "grad_norm": 0.2082678406004933, + "learning_rate": 1.9322504579442098e-05, + "loss": 0.3336, + "num_tokens": 674453313.0, + "step": 884 + }, + { + "epoch": 1.207568305719437, + "grad_norm": 0.22541917650661503, + "learning_rate": 1.9320611189540616e-05, + "loss": 0.3376, + "num_tokens": 675212051.0, + "step": 885 + }, + { + "epoch": 1.2089338866781176, + "grad_norm": 0.2054400878769126, + "learning_rate": 1.9318715261381e-05, + "loss": 0.3645, + "num_tokens": 675930971.0, + "step": 886 + }, + { + "epoch": 1.2102994676367982, + "grad_norm": 0.22583871319409823, + "learning_rate": 1.931681679554161e-05, + "loss": 0.3591, + "num_tokens": 676632984.0, + "step": 887 + }, + { + "epoch": 1.2116650485954787, + "grad_norm": 0.20963858343378733, + "learning_rate": 1.931491579260158e-05, + "loss": 0.3383, + "num_tokens": 677333340.0, + "step": 888 + }, + { + "epoch": 1.213030629554159, + "grad_norm": 0.20935971520820942, + "learning_rate": 1.9313012253140833e-05, + "loss": 0.3571, + "num_tokens": 678129113.0, + "step": 889 + }, + { + "epoch": 1.2143962105128396, + "grad_norm": 0.18888913938880342, + "learning_rate": 1.931110617774006e-05, + "loss": 0.3483, + "num_tokens": 678869608.0, + "step": 890 + }, + { + "epoch": 1.2157617914715202, + "grad_norm": 0.2263753363047279, + "learning_rate": 1.930919756698071e-05, + "loss": 0.3512, + "num_tokens": 679553554.0, + "step": 891 + }, + { + "epoch": 1.2171273724302007, + "grad_norm": 0.20681892978243827, + "learning_rate": 1.9307286421445023e-05, + "loss": 0.345, + "num_tokens": 680422050.0, + "step": 892 + }, + { + "epoch": 1.218492953388881, + "grad_norm": 0.20107974222576824, + "learning_rate": 1.9305372741716008e-05, + "loss": 0.3525, + "num_tokens": 681202885.0, + "step": 893 + }, + { + "epoch": 1.2198585343475616, + "grad_norm": 0.20455955055154176, + "learning_rate": 1.9303456528377444e-05, + "loss": 0.3405, + "num_tokens": 681955579.0, + "step": 894 + }, + { + "epoch": 1.2212241153062422, + "grad_norm": 0.20056820706659426, + "learning_rate": 1.9301537782013884e-05, + "loss": 0.3393, + "num_tokens": 682668266.0, + "step": 895 + }, + { + "epoch": 1.2225896962649228, + "grad_norm": 0.2287317027634672, + "learning_rate": 1.9299616503210657e-05, + "loss": 0.3487, + "num_tokens": 683406434.0, + "step": 896 + }, + { + "epoch": 1.2239552772236033, + "grad_norm": 0.2135978474830208, + "learning_rate": 1.929769269255386e-05, + "loss": 0.3473, + "num_tokens": 684209879.0, + "step": 897 + }, + { + "epoch": 1.2253208581822836, + "grad_norm": 0.21593332622963474, + "learning_rate": 1.929576635063037e-05, + "loss": 0.3464, + "num_tokens": 684902408.0, + "step": 898 + }, + { + "epoch": 1.2266864391409642, + "grad_norm": 0.20679704084756928, + "learning_rate": 1.929383747802782e-05, + "loss": 0.3487, + "num_tokens": 685761392.0, + "step": 899 + }, + { + "epoch": 1.2280520200996448, + "grad_norm": 0.21172760667058732, + "learning_rate": 1.929190607533463e-05, + "loss": 0.3286, + "num_tokens": 686535283.0, + "step": 900 + }, + { + "epoch": 1.2294176010583253, + "grad_norm": 0.21559225156061718, + "learning_rate": 1.9289972143139993e-05, + "loss": 0.3484, + "num_tokens": 687252380.0, + "step": 901 + }, + { + "epoch": 1.2307831820170057, + "grad_norm": 0.206461435190598, + "learning_rate": 1.928803568203386e-05, + "loss": 0.344, + "num_tokens": 688013226.0, + "step": 902 + }, + { + "epoch": 1.2321487629756862, + "grad_norm": 0.249320720317246, + "learning_rate": 1.9286096692606966e-05, + "loss": 0.3525, + "num_tokens": 688722910.0, + "step": 903 + }, + { + "epoch": 1.2335143439343668, + "grad_norm": 0.22298578762211163, + "learning_rate": 1.928415517545081e-05, + "loss": 0.3632, + "num_tokens": 689480695.0, + "step": 904 + }, + { + "epoch": 1.2348799248930473, + "grad_norm": 0.21398907317964638, + "learning_rate": 1.9282211131157668e-05, + "loss": 0.3529, + "num_tokens": 690231910.0, + "step": 905 + }, + { + "epoch": 1.2362455058517279, + "grad_norm": 0.2159969296053859, + "learning_rate": 1.928026456032058e-05, + "loss": 0.3539, + "num_tokens": 691017158.0, + "step": 906 + }, + { + "epoch": 1.2376110868104082, + "grad_norm": 0.2015170949291365, + "learning_rate": 1.9278315463533365e-05, + "loss": 0.3472, + "num_tokens": 691701734.0, + "step": 907 + }, + { + "epoch": 1.2389766677690888, + "grad_norm": 0.24649108499207192, + "learning_rate": 1.9276363841390603e-05, + "loss": 0.3396, + "num_tokens": 692576491.0, + "step": 908 + }, + { + "epoch": 1.2403422487277693, + "grad_norm": 0.2026259469053315, + "learning_rate": 1.9274409694487654e-05, + "loss": 0.3529, + "num_tokens": 693393118.0, + "step": 909 + }, + { + "epoch": 1.2417078296864499, + "grad_norm": 0.19428982705829625, + "learning_rate": 1.927245302342064e-05, + "loss": 0.3341, + "num_tokens": 694149888.0, + "step": 910 + }, + { + "epoch": 1.2430734106451302, + "grad_norm": 0.22577262738699272, + "learning_rate": 1.9270493828786457e-05, + "loss": 0.3489, + "num_tokens": 694908938.0, + "step": 911 + }, + { + "epoch": 1.2444389916038108, + "grad_norm": 0.1982661779929654, + "learning_rate": 1.9268532111182772e-05, + "loss": 0.34, + "num_tokens": 695717201.0, + "step": 912 + }, + { + "epoch": 1.2458045725624913, + "grad_norm": 0.19791120584715394, + "learning_rate": 1.9266567871208022e-05, + "loss": 0.3331, + "num_tokens": 696518422.0, + "step": 913 + }, + { + "epoch": 1.2471701535211719, + "grad_norm": 0.21571988790206187, + "learning_rate": 1.9264601109461412e-05, + "loss": 0.3339, + "num_tokens": 697253730.0, + "step": 914 + }, + { + "epoch": 1.2485357344798524, + "grad_norm": 0.2126983603731703, + "learning_rate": 1.9262631826542904e-05, + "loss": 0.3313, + "num_tokens": 698006536.0, + "step": 915 + }, + { + "epoch": 1.2499013154385328, + "grad_norm": 0.21342653764012828, + "learning_rate": 1.9260660023053256e-05, + "loss": 0.3295, + "num_tokens": 698712459.0, + "step": 916 + }, + { + "epoch": 1.2512668963972133, + "grad_norm": 0.218368896243392, + "learning_rate": 1.9258685699593972e-05, + "loss": 0.3273, + "num_tokens": 699421995.0, + "step": 917 + }, + { + "epoch": 1.252632477355894, + "grad_norm": 0.22264291582529575, + "learning_rate": 1.9256708856767338e-05, + "loss": 0.3572, + "num_tokens": 700254656.0, + "step": 918 + }, + { + "epoch": 1.2539980583145744, + "grad_norm": 0.21611845504912808, + "learning_rate": 1.9254729495176393e-05, + "loss": 0.3241, + "num_tokens": 700967683.0, + "step": 919 + }, + { + "epoch": 1.2553636392732548, + "grad_norm": 0.18863129681747487, + "learning_rate": 1.9252747615424966e-05, + "loss": 0.3277, + "num_tokens": 701725427.0, + "step": 920 + }, + { + "epoch": 1.2567292202319353, + "grad_norm": 0.22201479546361255, + "learning_rate": 1.9250763218117636e-05, + "loss": 0.3341, + "num_tokens": 702515753.0, + "step": 921 + }, + { + "epoch": 1.258094801190616, + "grad_norm": 0.2161054952344156, + "learning_rate": 1.9248776303859757e-05, + "loss": 0.3445, + "num_tokens": 703312570.0, + "step": 922 + }, + { + "epoch": 1.2594603821492965, + "grad_norm": 0.18451489888415928, + "learning_rate": 1.924678687325745e-05, + "loss": 0.345, + "num_tokens": 704075881.0, + "step": 923 + }, + { + "epoch": 1.260825963107977, + "grad_norm": 0.237986494280731, + "learning_rate": 1.924479492691761e-05, + "loss": 0.3517, + "num_tokens": 704881821.0, + "step": 924 + }, + { + "epoch": 1.2621915440666573, + "grad_norm": 0.20009995110294146, + "learning_rate": 1.924280046544789e-05, + "loss": 0.3589, + "num_tokens": 705636460.0, + "step": 925 + }, + { + "epoch": 1.263557125025338, + "grad_norm": 0.21559373028265322, + "learning_rate": 1.9240803489456713e-05, + "loss": 0.3387, + "num_tokens": 706282807.0, + "step": 926 + }, + { + "epoch": 1.2649227059840185, + "grad_norm": 0.2460565066724444, + "learning_rate": 1.923880399955327e-05, + "loss": 0.3506, + "num_tokens": 707084446.0, + "step": 927 + }, + { + "epoch": 1.266288286942699, + "grad_norm": 0.22862353852431203, + "learning_rate": 1.9236801996347513e-05, + "loss": 0.334, + "num_tokens": 707811973.0, + "step": 928 + }, + { + "epoch": 1.2676538679013793, + "grad_norm": 0.22059963119861575, + "learning_rate": 1.923479748045018e-05, + "loss": 0.3487, + "num_tokens": 708637101.0, + "step": 929 + }, + { + "epoch": 1.26901944886006, + "grad_norm": 0.19517680300458531, + "learning_rate": 1.923279045247275e-05, + "loss": 0.3437, + "num_tokens": 709439690.0, + "step": 930 + }, + { + "epoch": 1.2703850298187405, + "grad_norm": 0.2479375954572112, + "learning_rate": 1.9230780913027484e-05, + "loss": 0.3368, + "num_tokens": 710157194.0, + "step": 931 + }, + { + "epoch": 1.271750610777421, + "grad_norm": 0.21771506885320027, + "learning_rate": 1.922876886272741e-05, + "loss": 0.3519, + "num_tokens": 710876038.0, + "step": 932 + }, + { + "epoch": 1.2731161917361016, + "grad_norm": 0.20079152511242526, + "learning_rate": 1.9226754302186313e-05, + "loss": 0.325, + "num_tokens": 711629721.0, + "step": 933 + }, + { + "epoch": 1.274481772694782, + "grad_norm": 0.24386219741255047, + "learning_rate": 1.922473723201875e-05, + "loss": 0.3414, + "num_tokens": 712408387.0, + "step": 934 + }, + { + "epoch": 1.2758473536534625, + "grad_norm": 0.24175223401582388, + "learning_rate": 1.922271765284004e-05, + "loss": 0.3529, + "num_tokens": 713231205.0, + "step": 935 + }, + { + "epoch": 1.277212934612143, + "grad_norm": 0.21176233406611686, + "learning_rate": 1.9220695565266265e-05, + "loss": 0.3357, + "num_tokens": 714025276.0, + "step": 936 + }, + { + "epoch": 1.2785785155708236, + "grad_norm": 0.19314126072160737, + "learning_rate": 1.9218670969914284e-05, + "loss": 0.3482, + "num_tokens": 714757577.0, + "step": 937 + }, + { + "epoch": 1.279944096529504, + "grad_norm": 0.22150036668126125, + "learning_rate": 1.921664386740171e-05, + "loss": 0.3379, + "num_tokens": 715553807.0, + "step": 938 + }, + { + "epoch": 1.2813096774881845, + "grad_norm": 0.2111040354418008, + "learning_rate": 1.9214614258346928e-05, + "loss": 0.3417, + "num_tokens": 716320648.0, + "step": 939 + }, + { + "epoch": 1.282675258446865, + "grad_norm": 0.2303164328764197, + "learning_rate": 1.9212582143369078e-05, + "loss": 0.3398, + "num_tokens": 717066607.0, + "step": 940 + }, + { + "epoch": 1.2840408394055456, + "grad_norm": 0.20879361256930948, + "learning_rate": 1.9210547523088073e-05, + "loss": 0.3603, + "num_tokens": 717798341.0, + "step": 941 + }, + { + "epoch": 1.2854064203642261, + "grad_norm": 0.23176279347249062, + "learning_rate": 1.920851039812459e-05, + "loss": 0.3432, + "num_tokens": 718655059.0, + "step": 942 + }, + { + "epoch": 1.2867720013229065, + "grad_norm": 0.20531457263298838, + "learning_rate": 1.9206470769100063e-05, + "loss": 0.3231, + "num_tokens": 719398983.0, + "step": 943 + }, + { + "epoch": 1.288137582281587, + "grad_norm": 0.20914640582929006, + "learning_rate": 1.9204428636636694e-05, + "loss": 0.349, + "num_tokens": 720137181.0, + "step": 944 + }, + { + "epoch": 1.2895031632402676, + "grad_norm": 0.21045564925200708, + "learning_rate": 1.9202384001357452e-05, + "loss": 0.3357, + "num_tokens": 720887396.0, + "step": 945 + }, + { + "epoch": 1.2908687441989481, + "grad_norm": 0.19777279937695008, + "learning_rate": 1.920033686388607e-05, + "loss": 0.3413, + "num_tokens": 721650294.0, + "step": 946 + }, + { + "epoch": 1.2922343251576285, + "grad_norm": 0.23003136386994227, + "learning_rate": 1.9198287224847034e-05, + "loss": 0.3347, + "num_tokens": 722358014.0, + "step": 947 + }, + { + "epoch": 1.293599906116309, + "grad_norm": 0.21724007875928364, + "learning_rate": 1.9196235084865603e-05, + "loss": 0.3644, + "num_tokens": 723174884.0, + "step": 948 + }, + { + "epoch": 1.2949654870749896, + "grad_norm": 0.22242812289006225, + "learning_rate": 1.9194180444567796e-05, + "loss": 0.3425, + "num_tokens": 723980683.0, + "step": 949 + }, + { + "epoch": 1.2963310680336702, + "grad_norm": 0.24235339752294088, + "learning_rate": 1.9192123304580398e-05, + "loss": 0.3631, + "num_tokens": 724784950.0, + "step": 950 + }, + { + "epoch": 1.2976966489923507, + "grad_norm": 0.2072722317309631, + "learning_rate": 1.9190063665530947e-05, + "loss": 0.3522, + "num_tokens": 725552248.0, + "step": 951 + }, + { + "epoch": 1.299062229951031, + "grad_norm": 0.2394038013464679, + "learning_rate": 1.918800152804776e-05, + "loss": 0.3562, + "num_tokens": 726376218.0, + "step": 952 + }, + { + "epoch": 1.3004278109097116, + "grad_norm": 0.22565292148704347, + "learning_rate": 1.9185936892759893e-05, + "loss": 0.3597, + "num_tokens": 727105372.0, + "step": 953 + }, + { + "epoch": 1.3017933918683922, + "grad_norm": 0.22320333296989056, + "learning_rate": 1.918386976029718e-05, + "loss": 0.3499, + "num_tokens": 727818025.0, + "step": 954 + }, + { + "epoch": 1.3031589728270727, + "grad_norm": 0.242177705754565, + "learning_rate": 1.9181800131290222e-05, + "loss": 0.3555, + "num_tokens": 728555653.0, + "step": 955 + }, + { + "epoch": 1.304524553785753, + "grad_norm": 0.20226389795968736, + "learning_rate": 1.9179728006370367e-05, + "loss": 0.3341, + "num_tokens": 729381507.0, + "step": 956 + }, + { + "epoch": 1.3058901347444336, + "grad_norm": 0.2153484183374385, + "learning_rate": 1.917765338616973e-05, + "loss": 0.3456, + "num_tokens": 730115528.0, + "step": 957 + }, + { + "epoch": 1.3072557157031142, + "grad_norm": 0.23322247288989956, + "learning_rate": 1.9175576271321188e-05, + "loss": 0.343, + "num_tokens": 730878062.0, + "step": 958 + }, + { + "epoch": 1.3086212966617947, + "grad_norm": 0.1991822899683223, + "learning_rate": 1.917349666245838e-05, + "loss": 0.3565, + "num_tokens": 731664199.0, + "step": 959 + }, + { + "epoch": 1.3099868776204753, + "grad_norm": 0.22924913081574538, + "learning_rate": 1.9171414560215705e-05, + "loss": 0.3272, + "num_tokens": 732329125.0, + "step": 960 + }, + { + "epoch": 1.3113524585791556, + "grad_norm": 0.20915980221914657, + "learning_rate": 1.9169329965228323e-05, + "loss": 0.3505, + "num_tokens": 733128714.0, + "step": 961 + }, + { + "epoch": 1.3127180395378362, + "grad_norm": 0.2223105490801981, + "learning_rate": 1.916724287813215e-05, + "loss": 0.3593, + "num_tokens": 733937293.0, + "step": 962 + }, + { + "epoch": 1.3140836204965167, + "grad_norm": 0.20308503885377469, + "learning_rate": 1.916515329956387e-05, + "loss": 0.3552, + "num_tokens": 734734607.0, + "step": 963 + }, + { + "epoch": 1.3154492014551973, + "grad_norm": 0.20820254378488895, + "learning_rate": 1.9163061230160918e-05, + "loss": 0.3377, + "num_tokens": 735442588.0, + "step": 964 + }, + { + "epoch": 1.3168147824138776, + "grad_norm": 0.21645143501370412, + "learning_rate": 1.9160966670561502e-05, + "loss": 0.3546, + "num_tokens": 736237349.0, + "step": 965 + }, + { + "epoch": 1.3181803633725582, + "grad_norm": 0.21580788857459463, + "learning_rate": 1.9158869621404572e-05, + "loss": 0.3551, + "num_tokens": 736953126.0, + "step": 966 + }, + { + "epoch": 1.3195459443312387, + "grad_norm": 0.20345921802707528, + "learning_rate": 1.915677008332985e-05, + "loss": 0.3447, + "num_tokens": 737782878.0, + "step": 967 + }, + { + "epoch": 1.3209115252899193, + "grad_norm": 0.21736050900413706, + "learning_rate": 1.915466805697782e-05, + "loss": 0.352, + "num_tokens": 738571143.0, + "step": 968 + }, + { + "epoch": 1.3222771062485998, + "grad_norm": 0.20380288687700202, + "learning_rate": 1.915256354298971e-05, + "loss": 0.3648, + "num_tokens": 739373505.0, + "step": 969 + }, + { + "epoch": 1.3236426872072802, + "grad_norm": 0.2187041565793758, + "learning_rate": 1.915045654200752e-05, + "loss": 0.3337, + "num_tokens": 740070006.0, + "step": 970 + }, + { + "epoch": 1.3250082681659607, + "grad_norm": 0.22980616847624716, + "learning_rate": 1.914834705467401e-05, + "loss": 0.3597, + "num_tokens": 740862451.0, + "step": 971 + }, + { + "epoch": 1.3263738491246413, + "grad_norm": 0.205813085401544, + "learning_rate": 1.9146235081632687e-05, + "loss": 0.3218, + "num_tokens": 741583865.0, + "step": 972 + }, + { + "epoch": 1.3277394300833218, + "grad_norm": 0.20012872057025288, + "learning_rate": 1.9144120623527824e-05, + "loss": 0.3373, + "num_tokens": 742408521.0, + "step": 973 + }, + { + "epoch": 1.3291050110420022, + "grad_norm": 0.21608061460844172, + "learning_rate": 1.914200368100445e-05, + "loss": 0.3481, + "num_tokens": 743160985.0, + "step": 974 + }, + { + "epoch": 1.3304705920006827, + "grad_norm": 0.2352023540578385, + "learning_rate": 1.9139884254708353e-05, + "loss": 0.3564, + "num_tokens": 743904992.0, + "step": 975 + }, + { + "epoch": 1.3318361729593633, + "grad_norm": 0.19265647665416, + "learning_rate": 1.9137762345286077e-05, + "loss": 0.3449, + "num_tokens": 744663505.0, + "step": 976 + }, + { + "epoch": 1.3332017539180439, + "grad_norm": 0.20377383426491888, + "learning_rate": 1.913563795338493e-05, + "loss": 0.3551, + "num_tokens": 745491376.0, + "step": 977 + }, + { + "epoch": 1.3345673348767244, + "grad_norm": 0.21588129828653063, + "learning_rate": 1.9133511079652965e-05, + "loss": 0.3298, + "num_tokens": 746254832.0, + "step": 978 + }, + { + "epoch": 1.3359329158354047, + "grad_norm": 0.21577923760467208, + "learning_rate": 1.9131381724739e-05, + "loss": 0.3469, + "num_tokens": 747070818.0, + "step": 979 + }, + { + "epoch": 1.3372984967940853, + "grad_norm": 0.20218301388184165, + "learning_rate": 1.9129249889292614e-05, + "loss": 0.3385, + "num_tokens": 747768715.0, + "step": 980 + }, + { + "epoch": 1.3386640777527659, + "grad_norm": 0.1903518976197912, + "learning_rate": 1.912711557396413e-05, + "loss": 0.3688, + "num_tokens": 748595355.0, + "step": 981 + }, + { + "epoch": 1.3400296587114464, + "grad_norm": 0.2088139640674936, + "learning_rate": 1.9124978779404648e-05, + "loss": 0.3405, + "num_tokens": 749387982.0, + "step": 982 + }, + { + "epoch": 1.3413952396701267, + "grad_norm": 0.21099392343115447, + "learning_rate": 1.9122839506265998e-05, + "loss": 0.3317, + "num_tokens": 750111930.0, + "step": 983 + }, + { + "epoch": 1.3427608206288073, + "grad_norm": 0.2200938314581359, + "learning_rate": 1.9120697755200786e-05, + "loss": 0.3436, + "num_tokens": 750837799.0, + "step": 984 + }, + { + "epoch": 1.3441264015874879, + "grad_norm": 0.1921585652015767, + "learning_rate": 1.9118553526862365e-05, + "loss": 0.3425, + "num_tokens": 751591219.0, + "step": 985 + }, + { + "epoch": 1.3454919825461684, + "grad_norm": 0.2108152644594403, + "learning_rate": 1.911640682190485e-05, + "loss": 0.3397, + "num_tokens": 752313245.0, + "step": 986 + }, + { + "epoch": 1.346857563504849, + "grad_norm": 0.1991539069981807, + "learning_rate": 1.9114257640983106e-05, + "loss": 0.3636, + "num_tokens": 753064329.0, + "step": 987 + }, + { + "epoch": 1.3482231444635293, + "grad_norm": 0.18906600873651339, + "learning_rate": 1.9112105984752752e-05, + "loss": 0.3561, + "num_tokens": 753952656.0, + "step": 988 + }, + { + "epoch": 1.3495887254222099, + "grad_norm": 0.19434232533149676, + "learning_rate": 1.9109951853870168e-05, + "loss": 0.3243, + "num_tokens": 754612682.0, + "step": 989 + }, + { + "epoch": 1.3509543063808904, + "grad_norm": 0.20198763197861377, + "learning_rate": 1.9107795248992484e-05, + "loss": 0.3485, + "num_tokens": 755433330.0, + "step": 990 + }, + { + "epoch": 1.352319887339571, + "grad_norm": 0.18145603909795255, + "learning_rate": 1.910563617077759e-05, + "loss": 0.3357, + "num_tokens": 756189698.0, + "step": 991 + }, + { + "epoch": 1.3536854682982513, + "grad_norm": 0.20287739259347587, + "learning_rate": 1.9103474619884125e-05, + "loss": 0.3333, + "num_tokens": 756913299.0, + "step": 992 + }, + { + "epoch": 1.3550510492569319, + "grad_norm": 0.21332552684792763, + "learning_rate": 1.9101310596971488e-05, + "loss": 0.3438, + "num_tokens": 757647161.0, + "step": 993 + }, + { + "epoch": 1.3564166302156124, + "grad_norm": 0.19123283867132806, + "learning_rate": 1.9099144102699823e-05, + "loss": 0.3431, + "num_tokens": 758342215.0, + "step": 994 + }, + { + "epoch": 1.357782211174293, + "grad_norm": 0.210306784581258, + "learning_rate": 1.909697513773004e-05, + "loss": 0.3438, + "num_tokens": 759153206.0, + "step": 995 + }, + { + "epoch": 1.3591477921329735, + "grad_norm": 0.2104569929114126, + "learning_rate": 1.9094803702723785e-05, + "loss": 0.3366, + "num_tokens": 759885538.0, + "step": 996 + }, + { + "epoch": 1.3605133730916539, + "grad_norm": 0.20557835447448367, + "learning_rate": 1.9092629798343484e-05, + "loss": 0.3439, + "num_tokens": 760625195.0, + "step": 997 + }, + { + "epoch": 1.3618789540503344, + "grad_norm": 0.21615516994462022, + "learning_rate": 1.909045342525229e-05, + "loss": 0.3548, + "num_tokens": 761329022.0, + "step": 998 + }, + { + "epoch": 1.363244535009015, + "grad_norm": 0.2108311085038829, + "learning_rate": 1.9088274584114124e-05, + "loss": 0.3402, + "num_tokens": 762100819.0, + "step": 999 + }, + { + "epoch": 1.3646101159676955, + "grad_norm": 0.20812136037000667, + "learning_rate": 1.9086093275593656e-05, + "loss": 0.3534, + "num_tokens": 762939361.0, + "step": 1000 + }, + { + "epoch": 1.3659756969263759, + "grad_norm": 0.19308185592020102, + "learning_rate": 1.9083909500356307e-05, + "loss": 0.354, + "num_tokens": 763711552.0, + "step": 1001 + }, + { + "epoch": 1.3673412778850564, + "grad_norm": 0.20355380923101798, + "learning_rate": 1.9081723259068256e-05, + "loss": 0.3248, + "num_tokens": 764443102.0, + "step": 1002 + }, + { + "epoch": 1.368706858843737, + "grad_norm": 0.19772939267999595, + "learning_rate": 1.907953455239643e-05, + "loss": 0.3506, + "num_tokens": 765222188.0, + "step": 1003 + }, + { + "epoch": 1.3700724398024176, + "grad_norm": 0.19291879892121955, + "learning_rate": 1.9077343381008503e-05, + "loss": 0.3478, + "num_tokens": 766059302.0, + "step": 1004 + }, + { + "epoch": 1.371438020761098, + "grad_norm": 0.20708226359133314, + "learning_rate": 1.9075149745572912e-05, + "loss": 0.3436, + "num_tokens": 766772992.0, + "step": 1005 + }, + { + "epoch": 1.3728036017197784, + "grad_norm": 0.20762474121703964, + "learning_rate": 1.9072953646758837e-05, + "loss": 0.3449, + "num_tokens": 767544810.0, + "step": 1006 + }, + { + "epoch": 1.374169182678459, + "grad_norm": 0.20778799814333088, + "learning_rate": 1.9070755085236218e-05, + "loss": 0.3533, + "num_tokens": 768293750.0, + "step": 1007 + }, + { + "epoch": 1.3755347636371396, + "grad_norm": 0.19904617759650783, + "learning_rate": 1.9068554061675735e-05, + "loss": 0.3471, + "num_tokens": 769080589.0, + "step": 1008 + }, + { + "epoch": 1.3769003445958201, + "grad_norm": 0.2067066225634614, + "learning_rate": 1.9066350576748827e-05, + "loss": 0.3417, + "num_tokens": 769795210.0, + "step": 1009 + }, + { + "epoch": 1.3782659255545004, + "grad_norm": 0.18363191098514786, + "learning_rate": 1.9064144631127684e-05, + "loss": 0.3418, + "num_tokens": 770598994.0, + "step": 1010 + }, + { + "epoch": 1.379631506513181, + "grad_norm": 0.21060996646622115, + "learning_rate": 1.906193622548524e-05, + "loss": 0.3523, + "num_tokens": 771444771.0, + "step": 1011 + }, + { + "epoch": 1.3809970874718616, + "grad_norm": 0.200401063192149, + "learning_rate": 1.9059725360495188e-05, + "loss": 0.3641, + "num_tokens": 772224971.0, + "step": 1012 + }, + { + "epoch": 1.3823626684305421, + "grad_norm": 0.20553157568060929, + "learning_rate": 1.9057512036831962e-05, + "loss": 0.345, + "num_tokens": 773053413.0, + "step": 1013 + }, + { + "epoch": 1.3837282493892227, + "grad_norm": 0.21239282639640547, + "learning_rate": 1.905529625517076e-05, + "loss": 0.3489, + "num_tokens": 773815635.0, + "step": 1014 + }, + { + "epoch": 1.385093830347903, + "grad_norm": 0.20468928264581807, + "learning_rate": 1.9053078016187513e-05, + "loss": 0.3415, + "num_tokens": 774577434.0, + "step": 1015 + }, + { + "epoch": 1.3864594113065836, + "grad_norm": 0.19637590038453787, + "learning_rate": 1.905085732055891e-05, + "loss": 0.3402, + "num_tokens": 775300403.0, + "step": 1016 + }, + { + "epoch": 1.3878249922652641, + "grad_norm": 0.20795524875098276, + "learning_rate": 1.9048634168962394e-05, + "loss": 0.3496, + "num_tokens": 776072863.0, + "step": 1017 + }, + { + "epoch": 1.3891905732239447, + "grad_norm": 0.18297877011054348, + "learning_rate": 1.9046408562076152e-05, + "loss": 0.332, + "num_tokens": 776809411.0, + "step": 1018 + }, + { + "epoch": 1.390556154182625, + "grad_norm": 0.20619295381677805, + "learning_rate": 1.9044180500579116e-05, + "loss": 0.3413, + "num_tokens": 777602078.0, + "step": 1019 + }, + { + "epoch": 1.3919217351413056, + "grad_norm": 0.4318734834665932, + "learning_rate": 1.9041949985150972e-05, + "loss": 0.3437, + "num_tokens": 778411563.0, + "step": 1020 + }, + { + "epoch": 1.3932873160999861, + "grad_norm": 0.19705087454282383, + "learning_rate": 1.9039717016472158e-05, + "loss": 0.3424, + "num_tokens": 779214139.0, + "step": 1021 + }, + { + "epoch": 1.3946528970586667, + "grad_norm": 0.18491408596375214, + "learning_rate": 1.903748159522385e-05, + "loss": 0.3417, + "num_tokens": 779924509.0, + "step": 1022 + }, + { + "epoch": 1.3960184780173472, + "grad_norm": 0.21652287688885463, + "learning_rate": 1.9035243722087982e-05, + "loss": 0.3441, + "num_tokens": 780712634.0, + "step": 1023 + }, + { + "epoch": 1.3973840589760276, + "grad_norm": 0.20746999614545147, + "learning_rate": 1.9033003397747228e-05, + "loss": 0.3437, + "num_tokens": 781535885.0, + "step": 1024 + }, + { + "epoch": 1.3987496399347081, + "grad_norm": 0.20234658547269846, + "learning_rate": 1.9030760622885022e-05, + "loss": 0.347, + "num_tokens": 782325858.0, + "step": 1025 + }, + { + "epoch": 1.4001152208933887, + "grad_norm": 0.2095417444728844, + "learning_rate": 1.902851539818553e-05, + "loss": 0.3676, + "num_tokens": 783162812.0, + "step": 1026 + }, + { + "epoch": 1.4014808018520692, + "grad_norm": 0.23199469962397398, + "learning_rate": 1.9026267724333676e-05, + "loss": 0.3359, + "num_tokens": 783981985.0, + "step": 1027 + }, + { + "epoch": 1.4028463828107496, + "grad_norm": 0.20818371341430414, + "learning_rate": 1.9024017602015128e-05, + "loss": 0.3412, + "num_tokens": 784679182.0, + "step": 1028 + }, + { + "epoch": 1.4042119637694301, + "grad_norm": 0.20278938365081214, + "learning_rate": 1.90217650319163e-05, + "loss": 0.3461, + "num_tokens": 785526390.0, + "step": 1029 + }, + { + "epoch": 1.4055775447281107, + "grad_norm": 0.20608082457805832, + "learning_rate": 1.901951001472435e-05, + "loss": 0.3339, + "num_tokens": 786326316.0, + "step": 1030 + }, + { + "epoch": 1.4069431256867913, + "grad_norm": 0.1951729453129531, + "learning_rate": 1.9017252551127195e-05, + "loss": 0.3512, + "num_tokens": 787084834.0, + "step": 1031 + }, + { + "epoch": 1.4083087066454718, + "grad_norm": 0.21828481355306958, + "learning_rate": 1.9014992641813484e-05, + "loss": 0.3513, + "num_tokens": 787885803.0, + "step": 1032 + }, + { + "epoch": 1.4096742876041521, + "grad_norm": 0.19173953789797196, + "learning_rate": 1.901273028747262e-05, + "loss": 0.3365, + "num_tokens": 788689073.0, + "step": 1033 + }, + { + "epoch": 1.4110398685628327, + "grad_norm": 0.19112014861705404, + "learning_rate": 1.901046548879474e-05, + "loss": 0.3385, + "num_tokens": 789398548.0, + "step": 1034 + }, + { + "epoch": 1.4124054495215133, + "grad_norm": 0.26585117217622123, + "learning_rate": 1.9008198246470748e-05, + "loss": 0.3572, + "num_tokens": 790140215.0, + "step": 1035 + }, + { + "epoch": 1.4137710304801938, + "grad_norm": 0.22627088805654563, + "learning_rate": 1.9005928561192274e-05, + "loss": 0.3487, + "num_tokens": 790841488.0, + "step": 1036 + }, + { + "epoch": 1.4151366114388741, + "grad_norm": 0.23083922239152327, + "learning_rate": 1.900365643365171e-05, + "loss": 0.3398, + "num_tokens": 791541324.0, + "step": 1037 + }, + { + "epoch": 1.4165021923975547, + "grad_norm": 0.22632226794019863, + "learning_rate": 1.9001381864542167e-05, + "loss": 0.3459, + "num_tokens": 792334607.0, + "step": 1038 + }, + { + "epoch": 1.4178677733562353, + "grad_norm": 0.20692448370949737, + "learning_rate": 1.8999104854557534e-05, + "loss": 0.3537, + "num_tokens": 793048243.0, + "step": 1039 + }, + { + "epoch": 1.4192333543149158, + "grad_norm": 0.2005373224082317, + "learning_rate": 1.899682540439242e-05, + "loss": 0.3445, + "num_tokens": 793734926.0, + "step": 1040 + }, + { + "epoch": 1.4205989352735964, + "grad_norm": 0.22857768108250684, + "learning_rate": 1.8994543514742186e-05, + "loss": 0.363, + "num_tokens": 794421277.0, + "step": 1041 + }, + { + "epoch": 1.4219645162322767, + "grad_norm": 0.18133469861516482, + "learning_rate": 1.8992259186302944e-05, + "loss": 0.3398, + "num_tokens": 795229793.0, + "step": 1042 + }, + { + "epoch": 1.4233300971909573, + "grad_norm": 0.2041516211927476, + "learning_rate": 1.8989972419771537e-05, + "loss": 0.3482, + "num_tokens": 795997880.0, + "step": 1043 + }, + { + "epoch": 1.4246956781496378, + "grad_norm": 0.1820872206051627, + "learning_rate": 1.8987683215845562e-05, + "loss": 0.3359, + "num_tokens": 796782476.0, + "step": 1044 + }, + { + "epoch": 1.4260612591083184, + "grad_norm": 0.19818268218630272, + "learning_rate": 1.898539157522336e-05, + "loss": 0.338, + "num_tokens": 797568445.0, + "step": 1045 + }, + { + "epoch": 1.4274268400669987, + "grad_norm": 0.18863219153517585, + "learning_rate": 1.8983097498603996e-05, + "loss": 0.3268, + "num_tokens": 798369710.0, + "step": 1046 + }, + { + "epoch": 1.4287924210256793, + "grad_norm": 0.19205616586937002, + "learning_rate": 1.898080098668731e-05, + "loss": 0.3424, + "num_tokens": 799170218.0, + "step": 1047 + }, + { + "epoch": 1.4301580019843598, + "grad_norm": 0.1998242361005097, + "learning_rate": 1.897850204017386e-05, + "loss": 0.3414, + "num_tokens": 799962100.0, + "step": 1048 + }, + { + "epoch": 1.4315235829430404, + "grad_norm": 0.22636009043250874, + "learning_rate": 1.897620065976496e-05, + "loss": 0.3512, + "num_tokens": 800736291.0, + "step": 1049 + }, + { + "epoch": 1.432889163901721, + "grad_norm": 0.19773457209180703, + "learning_rate": 1.8973896846162655e-05, + "loss": 0.3237, + "num_tokens": 801479180.0, + "step": 1050 + }, + { + "epoch": 1.4342547448604013, + "grad_norm": 0.21245997240300052, + "learning_rate": 1.8971590600069743e-05, + "loss": 0.3399, + "num_tokens": 802317711.0, + "step": 1051 + }, + { + "epoch": 1.4356203258190818, + "grad_norm": 0.18548143963073324, + "learning_rate": 1.8969281922189763e-05, + "loss": 0.3331, + "num_tokens": 803208138.0, + "step": 1052 + }, + { + "epoch": 1.4369859067777624, + "grad_norm": 0.24584593132830695, + "learning_rate": 1.8966970813226983e-05, + "loss": 0.35, + "num_tokens": 803986681.0, + "step": 1053 + }, + { + "epoch": 1.438351487736443, + "grad_norm": 0.25225116012958304, + "learning_rate": 1.896465727388643e-05, + "loss": 0.3638, + "num_tokens": 804736757.0, + "step": 1054 + }, + { + "epoch": 1.4397170686951233, + "grad_norm": 0.21938336427996333, + "learning_rate": 1.8962341304873865e-05, + "loss": 0.3373, + "num_tokens": 805457947.0, + "step": 1055 + }, + { + "epoch": 1.4410826496538038, + "grad_norm": 0.23853847380707593, + "learning_rate": 1.8960022906895786e-05, + "loss": 0.3424, + "num_tokens": 806170211.0, + "step": 1056 + }, + { + "epoch": 1.4424482306124844, + "grad_norm": 0.22913980031888978, + "learning_rate": 1.8957702080659436e-05, + "loss": 0.3597, + "num_tokens": 806919908.0, + "step": 1057 + }, + { + "epoch": 1.443813811571165, + "grad_norm": 0.21253234599811305, + "learning_rate": 1.8955378826872805e-05, + "loss": 0.3536, + "num_tokens": 807742448.0, + "step": 1058 + }, + { + "epoch": 1.4451793925298455, + "grad_norm": 0.22252222173831698, + "learning_rate": 1.8953053146244607e-05, + "loss": 0.3266, + "num_tokens": 808524618.0, + "step": 1059 + }, + { + "epoch": 1.4465449734885258, + "grad_norm": 0.21971631808056, + "learning_rate": 1.8950725039484315e-05, + "loss": 0.3448, + "num_tokens": 809284663.0, + "step": 1060 + }, + { + "epoch": 1.4479105544472064, + "grad_norm": 0.20295447511599457, + "learning_rate": 1.8948394507302127e-05, + "loss": 0.3526, + "num_tokens": 810034681.0, + "step": 1061 + }, + { + "epoch": 1.449276135405887, + "grad_norm": 0.20275623953096, + "learning_rate": 1.894606155040899e-05, + "loss": 0.3361, + "num_tokens": 810838475.0, + "step": 1062 + }, + { + "epoch": 1.4506417163645675, + "grad_norm": 0.18406433706742742, + "learning_rate": 1.8943726169516593e-05, + "loss": 0.3339, + "num_tokens": 811591952.0, + "step": 1063 + }, + { + "epoch": 1.4520072973232478, + "grad_norm": 0.21222076328260056, + "learning_rate": 1.8941388365337353e-05, + "loss": 0.3458, + "num_tokens": 812338606.0, + "step": 1064 + }, + { + "epoch": 1.4533728782819284, + "grad_norm": 0.19844414388804144, + "learning_rate": 1.8939048138584433e-05, + "loss": 0.3479, + "num_tokens": 813084600.0, + "step": 1065 + }, + { + "epoch": 1.454738459240609, + "grad_norm": 0.23105250796925433, + "learning_rate": 1.893670548997174e-05, + "loss": 0.3429, + "num_tokens": 813846474.0, + "step": 1066 + }, + { + "epoch": 1.4561040401992895, + "grad_norm": 0.1746600741739495, + "learning_rate": 1.893436042021391e-05, + "loss": 0.3333, + "num_tokens": 814593720.0, + "step": 1067 + }, + { + "epoch": 1.45746962115797, + "grad_norm": 0.2272698091045512, + "learning_rate": 1.893201293002632e-05, + "loss": 0.357, + "num_tokens": 815413507.0, + "step": 1068 + }, + { + "epoch": 1.4588352021166504, + "grad_norm": 0.17944151383076679, + "learning_rate": 1.8929663020125098e-05, + "loss": 0.3456, + "num_tokens": 816145472.0, + "step": 1069 + }, + { + "epoch": 1.460200783075331, + "grad_norm": 0.2252011436909475, + "learning_rate": 1.8927310691227086e-05, + "loss": 0.3416, + "num_tokens": 816887616.0, + "step": 1070 + }, + { + "epoch": 1.4615663640340115, + "grad_norm": 0.20737394287463223, + "learning_rate": 1.8924955944049895e-05, + "loss": 0.3521, + "num_tokens": 817636306.0, + "step": 1071 + }, + { + "epoch": 1.462931944992692, + "grad_norm": 0.19180997158059387, + "learning_rate": 1.8922598779311835e-05, + "loss": 0.3249, + "num_tokens": 818354838.0, + "step": 1072 + }, + { + "epoch": 1.4642975259513724, + "grad_norm": 0.20760733410420126, + "learning_rate": 1.8920239197731994e-05, + "loss": 0.3272, + "num_tokens": 819108556.0, + "step": 1073 + }, + { + "epoch": 1.465663106910053, + "grad_norm": 0.20399193667365068, + "learning_rate": 1.8917877200030165e-05, + "loss": 0.3393, + "num_tokens": 819931541.0, + "step": 1074 + }, + { + "epoch": 1.4670286878687335, + "grad_norm": 0.20370843557232557, + "learning_rate": 1.8915512786926898e-05, + "loss": 0.3688, + "num_tokens": 820754766.0, + "step": 1075 + }, + { + "epoch": 1.468394268827414, + "grad_norm": 0.22237871067208817, + "learning_rate": 1.8913145959143474e-05, + "loss": 0.3393, + "num_tokens": 821496785.0, + "step": 1076 + }, + { + "epoch": 1.4697598497860946, + "grad_norm": 0.20609229354037506, + "learning_rate": 1.8910776717401903e-05, + "loss": 0.3624, + "num_tokens": 822266906.0, + "step": 1077 + }, + { + "epoch": 1.471125430744775, + "grad_norm": 0.20138522886539814, + "learning_rate": 1.890840506242495e-05, + "loss": 0.3269, + "num_tokens": 822963253.0, + "step": 1078 + }, + { + "epoch": 1.4724910117034555, + "grad_norm": 0.22859993003978582, + "learning_rate": 1.8906030994936088e-05, + "loss": 0.3387, + "num_tokens": 823680894.0, + "step": 1079 + }, + { + "epoch": 1.473856592662136, + "grad_norm": 0.18945100145312388, + "learning_rate": 1.890365451565955e-05, + "loss": 0.3399, + "num_tokens": 824416461.0, + "step": 1080 + }, + { + "epoch": 1.4752221736208166, + "grad_norm": 0.21215026118066474, + "learning_rate": 1.8901275625320304e-05, + "loss": 0.3433, + "num_tokens": 825187052.0, + "step": 1081 + }, + { + "epoch": 1.476587754579497, + "grad_norm": 0.20247565763683667, + "learning_rate": 1.8898894324644035e-05, + "loss": 0.3568, + "num_tokens": 825948377.0, + "step": 1082 + }, + { + "epoch": 1.4779533355381775, + "grad_norm": 0.20125433042539995, + "learning_rate": 1.8896510614357183e-05, + "loss": 0.3303, + "num_tokens": 826675665.0, + "step": 1083 + }, + { + "epoch": 1.479318916496858, + "grad_norm": 0.21527018032206333, + "learning_rate": 1.8894124495186908e-05, + "loss": 0.3406, + "num_tokens": 827442330.0, + "step": 1084 + }, + { + "epoch": 1.4806844974555387, + "grad_norm": 0.1924507722243901, + "learning_rate": 1.8891735967861116e-05, + "loss": 0.3468, + "num_tokens": 828225665.0, + "step": 1085 + }, + { + "epoch": 1.4820500784142192, + "grad_norm": 0.18722681766944713, + "learning_rate": 1.8889345033108443e-05, + "loss": 0.3646, + "num_tokens": 829039240.0, + "step": 1086 + }, + { + "epoch": 1.4834156593728995, + "grad_norm": 0.7483727239434015, + "learning_rate": 1.8886951691658258e-05, + "loss": 0.3586, + "num_tokens": 829803133.0, + "step": 1087 + }, + { + "epoch": 1.48478124033158, + "grad_norm": 0.24493737838005014, + "learning_rate": 1.8884555944240666e-05, + "loss": 0.3334, + "num_tokens": 830477988.0, + "step": 1088 + }, + { + "epoch": 1.4861468212902607, + "grad_norm": 0.20772852706927325, + "learning_rate": 1.888215779158651e-05, + "loss": 0.3259, + "num_tokens": 831142418.0, + "step": 1089 + }, + { + "epoch": 1.4875124022489412, + "grad_norm": 0.22001610076871084, + "learning_rate": 1.8879757234427357e-05, + "loss": 0.3506, + "num_tokens": 831952350.0, + "step": 1090 + }, + { + "epoch": 1.4888779832076215, + "grad_norm": 0.20751587118764553, + "learning_rate": 1.8877354273495515e-05, + "loss": 0.3511, + "num_tokens": 832718894.0, + "step": 1091 + }, + { + "epoch": 1.490243564166302, + "grad_norm": 0.21955051975737266, + "learning_rate": 1.8874948909524022e-05, + "loss": 0.3425, + "num_tokens": 833518145.0, + "step": 1092 + }, + { + "epoch": 1.4916091451249827, + "grad_norm": 0.20125206273827248, + "learning_rate": 1.887254114324665e-05, + "loss": 0.3382, + "num_tokens": 834282263.0, + "step": 1093 + }, + { + "epoch": 1.4929747260836632, + "grad_norm": 0.24605774509975437, + "learning_rate": 1.887013097539791e-05, + "loss": 0.3557, + "num_tokens": 835071186.0, + "step": 1094 + }, + { + "epoch": 1.4943403070423438, + "grad_norm": 0.21094922352229073, + "learning_rate": 1.8867718406713036e-05, + "loss": 0.3697, + "num_tokens": 835844139.0, + "step": 1095 + }, + { + "epoch": 1.495705888001024, + "grad_norm": 0.2196305845390831, + "learning_rate": 1.8865303437927996e-05, + "loss": 0.3444, + "num_tokens": 836667359.0, + "step": 1096 + }, + { + "epoch": 1.4970714689597047, + "grad_norm": 0.19318856311555563, + "learning_rate": 1.88628860697795e-05, + "loss": 0.3484, + "num_tokens": 837465077.0, + "step": 1097 + }, + { + "epoch": 1.4984370499183852, + "grad_norm": 0.22536676182141221, + "learning_rate": 1.8860466303004967e-05, + "loss": 0.3413, + "num_tokens": 838278586.0, + "step": 1098 + }, + { + "epoch": 1.4998026308770658, + "grad_norm": 0.19910609890134676, + "learning_rate": 1.885804413834258e-05, + "loss": 0.346, + "num_tokens": 839039986.0, + "step": 1099 + }, + { + "epoch": 1.5011682118357461, + "grad_norm": 0.2139337796925142, + "learning_rate": 1.8855619576531225e-05, + "loss": 0.3523, + "num_tokens": 839892474.0, + "step": 1100 + }, + { + "epoch": 1.5025337927944267, + "grad_norm": 0.18923142678416963, + "learning_rate": 1.8853192618310537e-05, + "loss": 0.3508, + "num_tokens": 840661420.0, + "step": 1101 + }, + { + "epoch": 1.5038993737531072, + "grad_norm": 0.23337713830177587, + "learning_rate": 1.885076326442088e-05, + "loss": 0.3359, + "num_tokens": 841366700.0, + "step": 1102 + }, + { + "epoch": 1.5052649547117878, + "grad_norm": 0.22920671166483242, + "learning_rate": 1.8848331515603326e-05, + "loss": 0.3474, + "num_tokens": 842157945.0, + "step": 1103 + }, + { + "epoch": 1.5066305356704683, + "grad_norm": 0.18249487649221832, + "learning_rate": 1.8845897372599718e-05, + "loss": 0.3372, + "num_tokens": 842903525.0, + "step": 1104 + }, + { + "epoch": 1.507996116629149, + "grad_norm": 0.21917797777206327, + "learning_rate": 1.8843460836152596e-05, + "loss": 0.3383, + "num_tokens": 843642274.0, + "step": 1105 + }, + { + "epoch": 1.5093616975878292, + "grad_norm": 0.21165685698432415, + "learning_rate": 1.8841021907005244e-05, + "loss": 0.3447, + "num_tokens": 844456405.0, + "step": 1106 + }, + { + "epoch": 1.5107272785465098, + "grad_norm": 0.18257209608361372, + "learning_rate": 1.8838580585901673e-05, + "loss": 0.3531, + "num_tokens": 845222915.0, + "step": 1107 + }, + { + "epoch": 1.5120928595051901, + "grad_norm": 0.20182479734907893, + "learning_rate": 1.8836136873586626e-05, + "loss": 0.3335, + "num_tokens": 846018937.0, + "step": 1108 + }, + { + "epoch": 1.5134584404638707, + "grad_norm": 0.2029057512845665, + "learning_rate": 1.8833690770805577e-05, + "loss": 0.3553, + "num_tokens": 846726028.0, + "step": 1109 + }, + { + "epoch": 1.5148240214225512, + "grad_norm": 0.1944718838078091, + "learning_rate": 1.883124227830472e-05, + "loss": 0.3481, + "num_tokens": 847427544.0, + "step": 1110 + }, + { + "epoch": 1.5161896023812318, + "grad_norm": 0.21190164218005594, + "learning_rate": 1.8828791396830986e-05, + "loss": 0.3376, + "num_tokens": 848120260.0, + "step": 1111 + }, + { + "epoch": 1.5175551833399124, + "grad_norm": 0.2178713649665666, + "learning_rate": 1.882633812713203e-05, + "loss": 0.3429, + "num_tokens": 848934638.0, + "step": 1112 + }, + { + "epoch": 1.518920764298593, + "grad_norm": 0.20334855244097014, + "learning_rate": 1.8823882469956246e-05, + "loss": 0.3467, + "num_tokens": 849700977.0, + "step": 1113 + }, + { + "epoch": 1.5202863452572735, + "grad_norm": 0.1994604802778267, + "learning_rate": 1.8821424426052742e-05, + "loss": 0.3426, + "num_tokens": 850492645.0, + "step": 1114 + }, + { + "epoch": 1.5216519262159538, + "grad_norm": 0.19556196442663648, + "learning_rate": 1.881896399617136e-05, + "loss": 0.3519, + "num_tokens": 851235954.0, + "step": 1115 + }, + { + "epoch": 1.5230175071746344, + "grad_norm": 0.1928520166616452, + "learning_rate": 1.8816501181062675e-05, + "loss": 0.3491, + "num_tokens": 852020802.0, + "step": 1116 + }, + { + "epoch": 1.5243830881333147, + "grad_norm": 0.20890518102693073, + "learning_rate": 1.8814035981477986e-05, + "loss": 0.3475, + "num_tokens": 852794546.0, + "step": 1117 + }, + { + "epoch": 1.5257486690919952, + "grad_norm": 0.20233755448265625, + "learning_rate": 1.8811568398169315e-05, + "loss": 0.3462, + "num_tokens": 853507662.0, + "step": 1118 + }, + { + "epoch": 1.5271142500506758, + "grad_norm": 0.22472413730464402, + "learning_rate": 1.880909843188941e-05, + "loss": 0.351, + "num_tokens": 854281205.0, + "step": 1119 + }, + { + "epoch": 1.5284798310093564, + "grad_norm": 0.19290522461879728, + "learning_rate": 1.8806626083391764e-05, + "loss": 0.3316, + "num_tokens": 854976425.0, + "step": 1120 + }, + { + "epoch": 1.529845411968037, + "grad_norm": 0.20610572963560608, + "learning_rate": 1.8804151353430577e-05, + "loss": 0.3485, + "num_tokens": 855758022.0, + "step": 1121 + }, + { + "epoch": 1.5312109929267175, + "grad_norm": 0.19095717657589478, + "learning_rate": 1.8801674242760773e-05, + "loss": 0.3547, + "num_tokens": 856591959.0, + "step": 1122 + }, + { + "epoch": 1.532576573885398, + "grad_norm": 0.20434419682633065, + "learning_rate": 1.8799194752138028e-05, + "loss": 0.3486, + "num_tokens": 857389990.0, + "step": 1123 + }, + { + "epoch": 1.5339421548440784, + "grad_norm": 0.19653369565495807, + "learning_rate": 1.8796712882318716e-05, + "loss": 0.3311, + "num_tokens": 858127252.0, + "step": 1124 + }, + { + "epoch": 1.535307735802759, + "grad_norm": 0.2053778040183756, + "learning_rate": 1.879422863405995e-05, + "loss": 0.363, + "num_tokens": 858900327.0, + "step": 1125 + }, + { + "epoch": 1.5366733167614393, + "grad_norm": 0.20878497674557545, + "learning_rate": 1.8791742008119566e-05, + "loss": 0.3368, + "num_tokens": 859666194.0, + "step": 1126 + }, + { + "epoch": 1.5380388977201198, + "grad_norm": 0.19527098390817324, + "learning_rate": 1.878925300525613e-05, + "loss": 0.3621, + "num_tokens": 860532261.0, + "step": 1127 + }, + { + "epoch": 1.5394044786788004, + "grad_norm": 0.1962551433692768, + "learning_rate": 1.878676162622892e-05, + "loss": 0.336, + "num_tokens": 861225131.0, + "step": 1128 + }, + { + "epoch": 1.540770059637481, + "grad_norm": 0.20587609452158792, + "learning_rate": 1.878426787179796e-05, + "loss": 0.3386, + "num_tokens": 861989442.0, + "step": 1129 + }, + { + "epoch": 1.5421356405961615, + "grad_norm": 0.21105967587191177, + "learning_rate": 1.8781771742723978e-05, + "loss": 0.3561, + "num_tokens": 862774724.0, + "step": 1130 + }, + { + "epoch": 1.543501221554842, + "grad_norm": 0.19049136855894255, + "learning_rate": 1.8779273239768435e-05, + "loss": 0.3347, + "num_tokens": 863576784.0, + "step": 1131 + }, + { + "epoch": 1.5448668025135226, + "grad_norm": 0.1886926352982969, + "learning_rate": 1.8776772363693524e-05, + "loss": 0.3458, + "num_tokens": 864374330.0, + "step": 1132 + }, + { + "epoch": 1.546232383472203, + "grad_norm": 0.1998432914446766, + "learning_rate": 1.8774269115262142e-05, + "loss": 0.3523, + "num_tokens": 865134067.0, + "step": 1133 + }, + { + "epoch": 1.5475979644308835, + "grad_norm": 0.19117886297509182, + "learning_rate": 1.8771763495237932e-05, + "loss": 0.3419, + "num_tokens": 865912923.0, + "step": 1134 + }, + { + "epoch": 1.5489635453895638, + "grad_norm": 0.2204979353558171, + "learning_rate": 1.8769255504385244e-05, + "loss": 0.3534, + "num_tokens": 866644649.0, + "step": 1135 + }, + { + "epoch": 1.5503291263482444, + "grad_norm": 0.18700041818170185, + "learning_rate": 1.8766745143469157e-05, + "loss": 0.3677, + "num_tokens": 867427608.0, + "step": 1136 + }, + { + "epoch": 1.551694707306925, + "grad_norm": 0.1966972070122868, + "learning_rate": 1.8764232413255483e-05, + "loss": 0.3417, + "num_tokens": 868137649.0, + "step": 1137 + }, + { + "epoch": 1.5530602882656055, + "grad_norm": 0.200052173154243, + "learning_rate": 1.8761717314510733e-05, + "loss": 0.3666, + "num_tokens": 868969239.0, + "step": 1138 + }, + { + "epoch": 1.554425869224286, + "grad_norm": 0.19201935034043988, + "learning_rate": 1.8759199848002162e-05, + "loss": 0.3445, + "num_tokens": 869732977.0, + "step": 1139 + }, + { + "epoch": 1.5557914501829666, + "grad_norm": 0.18934592209977974, + "learning_rate": 1.8756680014497746e-05, + "loss": 0.3536, + "num_tokens": 870544496.0, + "step": 1140 + }, + { + "epoch": 1.5571570311416472, + "grad_norm": 0.17151486465293897, + "learning_rate": 1.875415781476617e-05, + "loss": 0.3348, + "num_tokens": 871386951.0, + "step": 1141 + }, + { + "epoch": 1.5585226121003275, + "grad_norm": 0.18729295378207766, + "learning_rate": 1.875163324957684e-05, + "loss": 0.3576, + "num_tokens": 872218398.0, + "step": 1142 + }, + { + "epoch": 1.559888193059008, + "grad_norm": 0.1896623817171892, + "learning_rate": 1.8749106319699907e-05, + "loss": 0.342, + "num_tokens": 872948013.0, + "step": 1143 + }, + { + "epoch": 1.5612537740176884, + "grad_norm": 0.18068651369781916, + "learning_rate": 1.874657702590622e-05, + "loss": 0.3334, + "num_tokens": 873685991.0, + "step": 1144 + }, + { + "epoch": 1.562619354976369, + "grad_norm": 0.1923847741040314, + "learning_rate": 1.8744045368967357e-05, + "loss": 0.3599, + "num_tokens": 874395948.0, + "step": 1145 + }, + { + "epoch": 1.5639849359350495, + "grad_norm": 0.2074784431283741, + "learning_rate": 1.874151134965562e-05, + "loss": 0.3605, + "num_tokens": 875178449.0, + "step": 1146 + }, + { + "epoch": 1.56535051689373, + "grad_norm": 0.19223932976945451, + "learning_rate": 1.8738974968744026e-05, + "loss": 0.3493, + "num_tokens": 875987020.0, + "step": 1147 + }, + { + "epoch": 1.5667160978524106, + "grad_norm": 0.20618517115820317, + "learning_rate": 1.8736436227006318e-05, + "loss": 0.3395, + "num_tokens": 876714590.0, + "step": 1148 + }, + { + "epoch": 1.5680816788110912, + "grad_norm": 0.18571391805661056, + "learning_rate": 1.8733895125216952e-05, + "loss": 0.3334, + "num_tokens": 877502963.0, + "step": 1149 + }, + { + "epoch": 1.5694472597697717, + "grad_norm": 0.19203583444164035, + "learning_rate": 1.8731351664151108e-05, + "loss": 0.3502, + "num_tokens": 878295248.0, + "step": 1150 + }, + { + "epoch": 1.570812840728452, + "grad_norm": 0.1962647150714445, + "learning_rate": 1.8728805844584686e-05, + "loss": 0.3364, + "num_tokens": 878948915.0, + "step": 1151 + }, + { + "epoch": 1.5721784216871326, + "grad_norm": 0.19577049825559556, + "learning_rate": 1.8726257667294314e-05, + "loss": 0.3569, + "num_tokens": 879733990.0, + "step": 1152 + }, + { + "epoch": 1.573544002645813, + "grad_norm": 0.1853618764564945, + "learning_rate": 1.872370713305732e-05, + "loss": 0.3534, + "num_tokens": 880509444.0, + "step": 1153 + }, + { + "epoch": 1.5749095836044935, + "grad_norm": 0.19611455524324298, + "learning_rate": 1.872115424265176e-05, + "loss": 0.343, + "num_tokens": 881299960.0, + "step": 1154 + }, + { + "epoch": 1.576275164563174, + "grad_norm": 0.18776152638683133, + "learning_rate": 1.871859899685642e-05, + "loss": 0.352, + "num_tokens": 882061339.0, + "step": 1155 + }, + { + "epoch": 1.5776407455218546, + "grad_norm": 0.20845681625621995, + "learning_rate": 1.871604139645079e-05, + "loss": 0.3644, + "num_tokens": 882765973.0, + "step": 1156 + }, + { + "epoch": 1.5790063264805352, + "grad_norm": 0.2269135163191663, + "learning_rate": 1.8713481442215086e-05, + "loss": 0.3638, + "num_tokens": 883570192.0, + "step": 1157 + }, + { + "epoch": 1.5803719074392157, + "grad_norm": 0.20735092918046732, + "learning_rate": 1.871091913493023e-05, + "loss": 0.3476, + "num_tokens": 884336114.0, + "step": 1158 + }, + { + "epoch": 1.5817374883978963, + "grad_norm": 0.22174595292888044, + "learning_rate": 1.870835447537788e-05, + "loss": 0.3656, + "num_tokens": 885059560.0, + "step": 1159 + }, + { + "epoch": 1.5831030693565766, + "grad_norm": 0.21263915398200806, + "learning_rate": 1.8705787464340403e-05, + "loss": 0.3529, + "num_tokens": 885846501.0, + "step": 1160 + }, + { + "epoch": 1.5844686503152572, + "grad_norm": 0.1917925369663659, + "learning_rate": 1.8703218102600877e-05, + "loss": 0.3301, + "num_tokens": 886561332.0, + "step": 1161 + }, + { + "epoch": 1.5858342312739375, + "grad_norm": 0.219807446828307, + "learning_rate": 1.8700646390943108e-05, + "loss": 0.3379, + "num_tokens": 887321622.0, + "step": 1162 + }, + { + "epoch": 1.587199812232618, + "grad_norm": 0.19917040847015707, + "learning_rate": 1.869807233015161e-05, + "loss": 0.3392, + "num_tokens": 888137265.0, + "step": 1163 + }, + { + "epoch": 1.5885653931912986, + "grad_norm": 0.19655964204785956, + "learning_rate": 1.869549592101162e-05, + "loss": 0.3393, + "num_tokens": 888917315.0, + "step": 1164 + }, + { + "epoch": 1.5899309741499792, + "grad_norm": 0.19800604857768825, + "learning_rate": 1.8692917164309086e-05, + "loss": 0.3437, + "num_tokens": 889663794.0, + "step": 1165 + }, + { + "epoch": 1.5912965551086597, + "grad_norm": 0.23491109315563793, + "learning_rate": 1.8690336060830682e-05, + "loss": 0.3451, + "num_tokens": 890416078.0, + "step": 1166 + }, + { + "epoch": 1.5926621360673403, + "grad_norm": 0.18949255260315612, + "learning_rate": 1.868775261136378e-05, + "loss": 0.345, + "num_tokens": 891292450.0, + "step": 1167 + }, + { + "epoch": 1.5940277170260209, + "grad_norm": 0.218979769704418, + "learning_rate": 1.8685166816696488e-05, + "loss": 0.3409, + "num_tokens": 892049989.0, + "step": 1168 + }, + { + "epoch": 1.5953932979847012, + "grad_norm": 0.19301382124159722, + "learning_rate": 1.8682578677617616e-05, + "loss": 0.333, + "num_tokens": 892786184.0, + "step": 1169 + }, + { + "epoch": 1.5967588789433818, + "grad_norm": 0.22371489773819192, + "learning_rate": 1.8679988194916692e-05, + "loss": 0.35, + "num_tokens": 893542058.0, + "step": 1170 + }, + { + "epoch": 1.598124459902062, + "grad_norm": 0.20017402667137996, + "learning_rate": 1.8677395369383965e-05, + "loss": 0.3373, + "num_tokens": 894276344.0, + "step": 1171 + }, + { + "epoch": 1.5994900408607426, + "grad_norm": 0.20623063190784222, + "learning_rate": 1.867480020181039e-05, + "loss": 0.3434, + "num_tokens": 895049576.0, + "step": 1172 + }, + { + "epoch": 1.6008556218194232, + "grad_norm": 0.19802655498706712, + "learning_rate": 1.867220269298764e-05, + "loss": 0.347, + "num_tokens": 895841118.0, + "step": 1173 + }, + { + "epoch": 1.6022212027781038, + "grad_norm": 0.23050160751605783, + "learning_rate": 1.86696028437081e-05, + "loss": 0.3484, + "num_tokens": 896636430.0, + "step": 1174 + }, + { + "epoch": 1.6035867837367843, + "grad_norm": 0.19769972154016946, + "learning_rate": 1.8667000654764877e-05, + "loss": 0.3414, + "num_tokens": 897414539.0, + "step": 1175 + }, + { + "epoch": 1.6049523646954649, + "grad_norm": 0.2094126137513376, + "learning_rate": 1.8664396126951786e-05, + "loss": 0.3391, + "num_tokens": 898158005.0, + "step": 1176 + }, + { + "epoch": 1.6063179456541454, + "grad_norm": 0.19675319801751903, + "learning_rate": 1.8661789261063352e-05, + "loss": 0.3446, + "num_tokens": 898883706.0, + "step": 1177 + }, + { + "epoch": 1.6076835266128258, + "grad_norm": 0.20624693602336422, + "learning_rate": 1.8659180057894818e-05, + "loss": 0.3547, + "num_tokens": 899662132.0, + "step": 1178 + }, + { + "epoch": 1.6090491075715063, + "grad_norm": 0.19723105488806744, + "learning_rate": 1.8656568518242136e-05, + "loss": 0.338, + "num_tokens": 900428985.0, + "step": 1179 + }, + { + "epoch": 1.6104146885301867, + "grad_norm": 0.19576464912065347, + "learning_rate": 1.8653954642901983e-05, + "loss": 0.3504, + "num_tokens": 901132358.0, + "step": 1180 + }, + { + "epoch": 1.6117802694888672, + "grad_norm": 0.21681173104870213, + "learning_rate": 1.865133843267173e-05, + "loss": 0.3478, + "num_tokens": 901834067.0, + "step": 1181 + }, + { + "epoch": 1.6131458504475478, + "grad_norm": 0.19698628618101263, + "learning_rate": 1.8648719888349475e-05, + "loss": 0.3442, + "num_tokens": 902657804.0, + "step": 1182 + }, + { + "epoch": 1.6145114314062283, + "grad_norm": 0.19499666438737037, + "learning_rate": 1.8646099010734016e-05, + "loss": 0.3415, + "num_tokens": 903405433.0, + "step": 1183 + }, + { + "epoch": 1.6158770123649089, + "grad_norm": 0.18206617724791047, + "learning_rate": 1.8643475800624873e-05, + "loss": 0.3631, + "num_tokens": 904184629.0, + "step": 1184 + }, + { + "epoch": 1.6172425933235894, + "grad_norm": 0.21362017683058335, + "learning_rate": 1.8640850258822277e-05, + "loss": 0.3344, + "num_tokens": 904955451.0, + "step": 1185 + }, + { + "epoch": 1.61860817428227, + "grad_norm": 0.20277690628614012, + "learning_rate": 1.8638222386127163e-05, + "loss": 0.3597, + "num_tokens": 905696793.0, + "step": 1186 + }, + { + "epoch": 1.6199737552409503, + "grad_norm": 0.19441641525766237, + "learning_rate": 1.8635592183341182e-05, + "loss": 0.3296, + "num_tokens": 906438038.0, + "step": 1187 + }, + { + "epoch": 1.6213393361996309, + "grad_norm": 0.19556927058959284, + "learning_rate": 1.8632959651266696e-05, + "loss": 0.3518, + "num_tokens": 907270607.0, + "step": 1188 + }, + { + "epoch": 1.6227049171583112, + "grad_norm": 0.2003546239873265, + "learning_rate": 1.8630324790706777e-05, + "loss": 0.3504, + "num_tokens": 908090745.0, + "step": 1189 + }, + { + "epoch": 1.6240704981169918, + "grad_norm": 0.18581905697668577, + "learning_rate": 1.86276876024652e-05, + "loss": 0.3448, + "num_tokens": 908865997.0, + "step": 1190 + }, + { + "epoch": 1.6254360790756723, + "grad_norm": 0.20054550306610225, + "learning_rate": 1.8625048087346468e-05, + "loss": 0.3525, + "num_tokens": 909640165.0, + "step": 1191 + }, + { + "epoch": 1.626801660034353, + "grad_norm": 0.2173186201759512, + "learning_rate": 1.8622406246155777e-05, + "loss": 0.3379, + "num_tokens": 910419507.0, + "step": 1192 + }, + { + "epoch": 1.6281672409930334, + "grad_norm": 0.1904839672245517, + "learning_rate": 1.8619762079699038e-05, + "loss": 0.3401, + "num_tokens": 911224684.0, + "step": 1193 + }, + { + "epoch": 1.629532821951714, + "grad_norm": 0.19432253490254348, + "learning_rate": 1.861711558878287e-05, + "loss": 0.3333, + "num_tokens": 911996270.0, + "step": 1194 + }, + { + "epoch": 1.6308984029103946, + "grad_norm": 0.19323334654918106, + "learning_rate": 1.8614466774214605e-05, + "loss": 0.3511, + "num_tokens": 912769798.0, + "step": 1195 + }, + { + "epoch": 1.632263983869075, + "grad_norm": 0.19598720178477153, + "learning_rate": 1.8611815636802285e-05, + "loss": 0.3501, + "num_tokens": 913646571.0, + "step": 1196 + }, + { + "epoch": 1.6336295648277555, + "grad_norm": 0.20935599741357566, + "learning_rate": 1.8609162177354653e-05, + "loss": 0.3516, + "num_tokens": 914384512.0, + "step": 1197 + }, + { + "epoch": 1.6349951457864358, + "grad_norm": 0.1988318709329742, + "learning_rate": 1.8606506396681164e-05, + "loss": 0.3346, + "num_tokens": 915134562.0, + "step": 1198 + }, + { + "epoch": 1.6363607267451163, + "grad_norm": 0.2106000854464148, + "learning_rate": 1.8603848295591985e-05, + "loss": 0.3553, + "num_tokens": 915937181.0, + "step": 1199 + }, + { + "epoch": 1.637726307703797, + "grad_norm": 0.19687573918019438, + "learning_rate": 1.8601187874897985e-05, + "loss": 0.3342, + "num_tokens": 916696280.0, + "step": 1200 + }, + { + "epoch": 1.6390918886624775, + "grad_norm": 0.21474057369610397, + "learning_rate": 1.8598525135410742e-05, + "loss": 0.33, + "num_tokens": 917412503.0, + "step": 1201 + }, + { + "epoch": 1.640457469621158, + "grad_norm": 0.20016869110189686, + "learning_rate": 1.8595860077942545e-05, + "loss": 0.3382, + "num_tokens": 918117121.0, + "step": 1202 + }, + { + "epoch": 1.6418230505798386, + "grad_norm": 0.19995265187043457, + "learning_rate": 1.859319270330639e-05, + "loss": 0.354, + "num_tokens": 918937113.0, + "step": 1203 + }, + { + "epoch": 1.6431886315385191, + "grad_norm": 0.21276174984042134, + "learning_rate": 1.859052301231597e-05, + "loss": 0.3425, + "num_tokens": 919612274.0, + "step": 1204 + }, + { + "epoch": 1.6445542124971995, + "grad_norm": 0.18200229864099504, + "learning_rate": 1.85878510057857e-05, + "loss": 0.3302, + "num_tokens": 920432589.0, + "step": 1205 + }, + { + "epoch": 1.64591979345588, + "grad_norm": 0.1936318122635255, + "learning_rate": 1.858517668453069e-05, + "loss": 0.3454, + "num_tokens": 921231719.0, + "step": 1206 + }, + { + "epoch": 1.6472853744145604, + "grad_norm": 0.21888372027519343, + "learning_rate": 1.8582500049366758e-05, + "loss": 0.3468, + "num_tokens": 921997319.0, + "step": 1207 + }, + { + "epoch": 1.648650955373241, + "grad_norm": 0.20906338645847447, + "learning_rate": 1.8579821101110433e-05, + "loss": 0.3403, + "num_tokens": 922727822.0, + "step": 1208 + }, + { + "epoch": 1.6500165363319215, + "grad_norm": 0.19551370107277014, + "learning_rate": 1.8577139840578945e-05, + "loss": 0.3501, + "num_tokens": 923479628.0, + "step": 1209 + }, + { + "epoch": 1.651382117290602, + "grad_norm": 0.20838917784372538, + "learning_rate": 1.8574456268590227e-05, + "loss": 0.3428, + "num_tokens": 924180362.0, + "step": 1210 + }, + { + "epoch": 1.6527476982492826, + "grad_norm": 0.20059919642802865, + "learning_rate": 1.8571770385962924e-05, + "loss": 0.3416, + "num_tokens": 925028872.0, + "step": 1211 + }, + { + "epoch": 1.6541132792079631, + "grad_norm": 0.18834584130405413, + "learning_rate": 1.856908219351638e-05, + "loss": 0.3373, + "num_tokens": 925812614.0, + "step": 1212 + }, + { + "epoch": 1.6554788601666437, + "grad_norm": 0.1846061237520437, + "learning_rate": 1.856639169207065e-05, + "loss": 0.3509, + "num_tokens": 926595468.0, + "step": 1213 + }, + { + "epoch": 1.656844441125324, + "grad_norm": 0.19717897134061352, + "learning_rate": 1.8563698882446484e-05, + "loss": 0.339, + "num_tokens": 927392118.0, + "step": 1214 + }, + { + "epoch": 1.6582100220840046, + "grad_norm": 0.19561888630466406, + "learning_rate": 1.856100376546535e-05, + "loss": 0.3412, + "num_tokens": 928157989.0, + "step": 1215 + }, + { + "epoch": 1.659575603042685, + "grad_norm": 0.21227557027489777, + "learning_rate": 1.8558306341949404e-05, + "loss": 0.3216, + "num_tokens": 928846657.0, + "step": 1216 + }, + { + "epoch": 1.6609411840013655, + "grad_norm": 0.17923808369042452, + "learning_rate": 1.855560661272151e-05, + "loss": 0.3411, + "num_tokens": 929638962.0, + "step": 1217 + }, + { + "epoch": 1.662306764960046, + "grad_norm": 0.20858444491270087, + "learning_rate": 1.8552904578605256e-05, + "loss": 0.3526, + "num_tokens": 930366074.0, + "step": 1218 + }, + { + "epoch": 1.6636723459187266, + "grad_norm": 0.20233809861716978, + "learning_rate": 1.8550200240424895e-05, + "loss": 0.336, + "num_tokens": 931085956.0, + "step": 1219 + }, + { + "epoch": 1.6650379268774071, + "grad_norm": 0.19060408051362424, + "learning_rate": 1.8547493599005416e-05, + "loss": 0.3306, + "num_tokens": 931840275.0, + "step": 1220 + }, + { + "epoch": 1.6664035078360877, + "grad_norm": 0.19741307256773502, + "learning_rate": 1.854478465517249e-05, + "loss": 0.3429, + "num_tokens": 932575611.0, + "step": 1221 + }, + { + "epoch": 1.6677690887947683, + "grad_norm": 0.19940823552079082, + "learning_rate": 1.8542073409752506e-05, + "loss": 0.3524, + "num_tokens": 933317306.0, + "step": 1222 + }, + { + "epoch": 1.6691346697534486, + "grad_norm": 0.1929246208801896, + "learning_rate": 1.8539359863572545e-05, + "loss": 0.34, + "num_tokens": 934017808.0, + "step": 1223 + }, + { + "epoch": 1.6705002507121292, + "grad_norm": 0.19122182552525324, + "learning_rate": 1.8536644017460387e-05, + "loss": 0.3408, + "num_tokens": 934747849.0, + "step": 1224 + }, + { + "epoch": 1.6718658316708095, + "grad_norm": 0.19270003898465832, + "learning_rate": 1.8533925872244524e-05, + "loss": 0.3219, + "num_tokens": 935398512.0, + "step": 1225 + }, + { + "epoch": 1.67323141262949, + "grad_norm": 0.20010170059158036, + "learning_rate": 1.8531205428754146e-05, + "loss": 0.3459, + "num_tokens": 936201755.0, + "step": 1226 + }, + { + "epoch": 1.6745969935881706, + "grad_norm": 0.17470186634204232, + "learning_rate": 1.852848268781914e-05, + "loss": 0.3401, + "num_tokens": 937026108.0, + "step": 1227 + }, + { + "epoch": 1.6759625745468512, + "grad_norm": 0.1987255591481813, + "learning_rate": 1.85257576502701e-05, + "loss": 0.3459, + "num_tokens": 937830227.0, + "step": 1228 + }, + { + "epoch": 1.6773281555055317, + "grad_norm": 0.1966232952097899, + "learning_rate": 1.852303031693831e-05, + "loss": 0.3348, + "num_tokens": 938602809.0, + "step": 1229 + }, + { + "epoch": 1.6786937364642123, + "grad_norm": 0.18811364351747842, + "learning_rate": 1.8520300688655763e-05, + "loss": 0.334, + "num_tokens": 939307550.0, + "step": 1230 + }, + { + "epoch": 1.6800593174228928, + "grad_norm": 0.19337673039104525, + "learning_rate": 1.851756876625516e-05, + "loss": 0.3377, + "num_tokens": 940033170.0, + "step": 1231 + }, + { + "epoch": 1.6814248983815732, + "grad_norm": 0.19144029715453328, + "learning_rate": 1.8514834550569878e-05, + "loss": 0.3499, + "num_tokens": 940818366.0, + "step": 1232 + }, + { + "epoch": 1.6827904793402537, + "grad_norm": 0.21169326485977855, + "learning_rate": 1.8512098042434016e-05, + "loss": 0.3531, + "num_tokens": 941562733.0, + "step": 1233 + }, + { + "epoch": 1.684156060298934, + "grad_norm": 0.20753901874145128, + "learning_rate": 1.8509359242682363e-05, + "loss": 0.3315, + "num_tokens": 942309901.0, + "step": 1234 + }, + { + "epoch": 1.6855216412576146, + "grad_norm": 0.19462920101168749, + "learning_rate": 1.850661815215041e-05, + "loss": 0.3619, + "num_tokens": 943082992.0, + "step": 1235 + }, + { + "epoch": 1.6868872222162952, + "grad_norm": 0.1962633877018682, + "learning_rate": 1.850387477167434e-05, + "loss": 0.3356, + "num_tokens": 943902619.0, + "step": 1236 + }, + { + "epoch": 1.6882528031749757, + "grad_norm": 0.1966731771790236, + "learning_rate": 1.8501129102091042e-05, + "loss": 0.3442, + "num_tokens": 944742807.0, + "step": 1237 + }, + { + "epoch": 1.6896183841336563, + "grad_norm": 0.19049572605099718, + "learning_rate": 1.84983811442381e-05, + "loss": 0.3274, + "num_tokens": 945491535.0, + "step": 1238 + }, + { + "epoch": 1.6909839650923368, + "grad_norm": 0.1939506173896781, + "learning_rate": 1.8495630898953802e-05, + "loss": 0.3414, + "num_tokens": 946259346.0, + "step": 1239 + }, + { + "epoch": 1.6923495460510174, + "grad_norm": 0.1796586678448362, + "learning_rate": 1.8492878367077124e-05, + "loss": 0.3552, + "num_tokens": 947042346.0, + "step": 1240 + }, + { + "epoch": 1.6937151270096977, + "grad_norm": 0.2100435951056057, + "learning_rate": 1.8490123549447742e-05, + "loss": 0.325, + "num_tokens": 947742050.0, + "step": 1241 + }, + { + "epoch": 1.6950807079683783, + "grad_norm": 0.18245678796797613, + "learning_rate": 1.8487366446906037e-05, + "loss": 0.3454, + "num_tokens": 948460727.0, + "step": 1242 + }, + { + "epoch": 1.6964462889270586, + "grad_norm": 0.19530374892053062, + "learning_rate": 1.8484607060293084e-05, + "loss": 0.3379, + "num_tokens": 949212962.0, + "step": 1243 + }, + { + "epoch": 1.6978118698857392, + "grad_norm": 0.20051938910068431, + "learning_rate": 1.848184539045064e-05, + "loss": 0.3483, + "num_tokens": 950015170.0, + "step": 1244 + }, + { + "epoch": 1.6991774508444197, + "grad_norm": 0.19544817532994013, + "learning_rate": 1.847908143822118e-05, + "loss": 0.341, + "num_tokens": 950789340.0, + "step": 1245 + }, + { + "epoch": 1.7005430318031003, + "grad_norm": 0.18588948205135084, + "learning_rate": 1.8476315204447867e-05, + "loss": 0.3502, + "num_tokens": 951538216.0, + "step": 1246 + }, + { + "epoch": 1.7019086127617808, + "grad_norm": 0.19265518491528197, + "learning_rate": 1.8473546689974558e-05, + "loss": 0.3482, + "num_tokens": 952324531.0, + "step": 1247 + }, + { + "epoch": 1.7032741937204614, + "grad_norm": 0.18357383373215874, + "learning_rate": 1.8470775895645802e-05, + "loss": 0.3503, + "num_tokens": 953075087.0, + "step": 1248 + }, + { + "epoch": 1.704639774679142, + "grad_norm": 0.19348833324227807, + "learning_rate": 1.8468002822306855e-05, + "loss": 0.3334, + "num_tokens": 953891104.0, + "step": 1249 + }, + { + "epoch": 1.7060053556378223, + "grad_norm": 0.1946291181658379, + "learning_rate": 1.8465227470803658e-05, + "loss": 0.3453, + "num_tokens": 954650868.0, + "step": 1250 + }, + { + "epoch": 1.7073709365965029, + "grad_norm": 0.21991654041949513, + "learning_rate": 1.846244984198285e-05, + "loss": 0.3366, + "num_tokens": 955350949.0, + "step": 1251 + }, + { + "epoch": 1.7087365175551832, + "grad_norm": 0.18888806066759609, + "learning_rate": 1.8459669936691768e-05, + "loss": 0.3553, + "num_tokens": 956108107.0, + "step": 1252 + }, + { + "epoch": 1.7101020985138637, + "grad_norm": 0.20173391709949873, + "learning_rate": 1.8456887755778436e-05, + "loss": 0.3248, + "num_tokens": 956824755.0, + "step": 1253 + }, + { + "epoch": 1.7114676794725443, + "grad_norm": 0.2177548548134337, + "learning_rate": 1.8454103300091584e-05, + "loss": 0.3414, + "num_tokens": 957591891.0, + "step": 1254 + }, + { + "epoch": 1.7128332604312249, + "grad_norm": 0.19958437720386601, + "learning_rate": 1.8451316570480625e-05, + "loss": 0.3422, + "num_tokens": 958328498.0, + "step": 1255 + }, + { + "epoch": 1.7141988413899054, + "grad_norm": 0.2626473427553063, + "learning_rate": 1.8448527567795667e-05, + "loss": 0.3177, + "num_tokens": 959035921.0, + "step": 1256 + }, + { + "epoch": 1.715564422348586, + "grad_norm": 0.19790715994422373, + "learning_rate": 1.8445736292887517e-05, + "loss": 0.3475, + "num_tokens": 959763911.0, + "step": 1257 + }, + { + "epoch": 1.7169300033072665, + "grad_norm": 0.2096717432104978, + "learning_rate": 1.844294274660767e-05, + "loss": 0.35, + "num_tokens": 960581633.0, + "step": 1258 + }, + { + "epoch": 1.7182955842659469, + "grad_norm": 0.20603013298951786, + "learning_rate": 1.8440146929808323e-05, + "loss": 0.3329, + "num_tokens": 961311201.0, + "step": 1259 + }, + { + "epoch": 1.7196611652246274, + "grad_norm": 0.2140267008877135, + "learning_rate": 1.8437348843342345e-05, + "loss": 0.3421, + "num_tokens": 962135110.0, + "step": 1260 + }, + { + "epoch": 1.7210267461833078, + "grad_norm": 0.2206616602340336, + "learning_rate": 1.8434548488063324e-05, + "loss": 0.3322, + "num_tokens": 962877982.0, + "step": 1261 + }, + { + "epoch": 1.7223923271419883, + "grad_norm": 0.18472855804602945, + "learning_rate": 1.8431745864825523e-05, + "loss": 0.3513, + "num_tokens": 963664778.0, + "step": 1262 + }, + { + "epoch": 1.7237579081006689, + "grad_norm": 0.22151923854615338, + "learning_rate": 1.8428940974483896e-05, + "loss": 0.3406, + "num_tokens": 964383183.0, + "step": 1263 + }, + { + "epoch": 1.7251234890593494, + "grad_norm": 0.20398157824017815, + "learning_rate": 1.8426133817894102e-05, + "loss": 0.3356, + "num_tokens": 965127212.0, + "step": 1264 + }, + { + "epoch": 1.72648907001803, + "grad_norm": 0.19879527163078042, + "learning_rate": 1.842332439591248e-05, + "loss": 0.3498, + "num_tokens": 965943491.0, + "step": 1265 + }, + { + "epoch": 1.7278546509767105, + "grad_norm": 0.22992745736348894, + "learning_rate": 1.8420512709396057e-05, + "loss": 0.3363, + "num_tokens": 966704554.0, + "step": 1266 + }, + { + "epoch": 1.729220231935391, + "grad_norm": 0.23017025600342914, + "learning_rate": 1.8417698759202562e-05, + "loss": 0.3247, + "num_tokens": 967443941.0, + "step": 1267 + }, + { + "epoch": 1.7305858128940714, + "grad_norm": 0.20516326444145594, + "learning_rate": 1.841488254619041e-05, + "loss": 0.3235, + "num_tokens": 968170723.0, + "step": 1268 + }, + { + "epoch": 1.731951393852752, + "grad_norm": 0.18490606870064655, + "learning_rate": 1.8412064071218704e-05, + "loss": 0.334, + "num_tokens": 968917602.0, + "step": 1269 + }, + { + "epoch": 1.7333169748114323, + "grad_norm": 0.22526472218814042, + "learning_rate": 1.8409243335147238e-05, + "loss": 0.335, + "num_tokens": 969663786.0, + "step": 1270 + }, + { + "epoch": 1.7346825557701129, + "grad_norm": 0.19693186744229033, + "learning_rate": 1.8406420338836498e-05, + "loss": 0.3425, + "num_tokens": 970455867.0, + "step": 1271 + }, + { + "epoch": 1.7360481367287934, + "grad_norm": 0.20032020526254093, + "learning_rate": 1.8403595083147662e-05, + "loss": 0.3644, + "num_tokens": 971248699.0, + "step": 1272 + }, + { + "epoch": 1.737413717687474, + "grad_norm": 0.219464872986859, + "learning_rate": 1.8400767568942586e-05, + "loss": 0.3449, + "num_tokens": 972016614.0, + "step": 1273 + }, + { + "epoch": 1.7387792986461545, + "grad_norm": 0.23308035470839664, + "learning_rate": 1.839793779708382e-05, + "loss": 0.3468, + "num_tokens": 972771512.0, + "step": 1274 + }, + { + "epoch": 1.740144879604835, + "grad_norm": 0.190929961619599, + "learning_rate": 1.8395105768434615e-05, + "loss": 0.3593, + "num_tokens": 973502735.0, + "step": 1275 + }, + { + "epoch": 1.7415104605635157, + "grad_norm": 0.21583161687324673, + "learning_rate": 1.839227148385889e-05, + "loss": 0.3552, + "num_tokens": 974196212.0, + "step": 1276 + }, + { + "epoch": 1.742876041522196, + "grad_norm": 0.19011482578637956, + "learning_rate": 1.8389434944221274e-05, + "loss": 0.3399, + "num_tokens": 974944799.0, + "step": 1277 + }, + { + "epoch": 1.7442416224808766, + "grad_norm": 0.19671668021681205, + "learning_rate": 1.838659615038706e-05, + "loss": 0.3359, + "num_tokens": 975748225.0, + "step": 1278 + }, + { + "epoch": 1.7456072034395569, + "grad_norm": 0.19649941611618246, + "learning_rate": 1.838375510322225e-05, + "loss": 0.3509, + "num_tokens": 976552779.0, + "step": 1279 + }, + { + "epoch": 1.7469727843982374, + "grad_norm": 0.20769773590433582, + "learning_rate": 1.838091180359352e-05, + "loss": 0.3454, + "num_tokens": 977323948.0, + "step": 1280 + }, + { + "epoch": 1.748338365356918, + "grad_norm": 0.19572746951340345, + "learning_rate": 1.837806625236824e-05, + "loss": 0.3338, + "num_tokens": 978154226.0, + "step": 1281 + }, + { + "epoch": 1.7497039463155986, + "grad_norm": 0.19465774262198837, + "learning_rate": 1.837521845041446e-05, + "loss": 0.339, + "num_tokens": 978900281.0, + "step": 1282 + }, + { + "epoch": 1.7510695272742791, + "grad_norm": 0.20263752204903554, + "learning_rate": 1.8372368398600928e-05, + "loss": 0.3469, + "num_tokens": 979717687.0, + "step": 1283 + }, + { + "epoch": 1.7524351082329597, + "grad_norm": 0.1874261607761662, + "learning_rate": 1.8369516097797067e-05, + "loss": 0.3441, + "num_tokens": 980482708.0, + "step": 1284 + }, + { + "epoch": 1.7538006891916402, + "grad_norm": 0.19583140909256944, + "learning_rate": 1.836666154887299e-05, + "loss": 0.3426, + "num_tokens": 981324398.0, + "step": 1285 + }, + { + "epoch": 1.7551662701503206, + "grad_norm": 0.20071873178609426, + "learning_rate": 1.8363804752699497e-05, + "loss": 0.3538, + "num_tokens": 982189022.0, + "step": 1286 + }, + { + "epoch": 1.7565318511090011, + "grad_norm": 0.1897794560492021, + "learning_rate": 1.836094571014807e-05, + "loss": 0.3445, + "num_tokens": 982907827.0, + "step": 1287 + }, + { + "epoch": 1.7578974320676815, + "grad_norm": 0.19465828086602127, + "learning_rate": 1.835808442209089e-05, + "loss": 0.3511, + "num_tokens": 983702106.0, + "step": 1288 + }, + { + "epoch": 1.759263013026362, + "grad_norm": 0.20100937394293544, + "learning_rate": 1.83552208894008e-05, + "loss": 0.3408, + "num_tokens": 984378865.0, + "step": 1289 + }, + { + "epoch": 1.7606285939850426, + "grad_norm": 0.20058606169969928, + "learning_rate": 1.8352355112951346e-05, + "loss": 0.3619, + "num_tokens": 985168260.0, + "step": 1290 + }, + { + "epoch": 1.7619941749437231, + "grad_norm": 0.20132072052026914, + "learning_rate": 1.834948709361675e-05, + "loss": 0.3407, + "num_tokens": 985833549.0, + "step": 1291 + }, + { + "epoch": 1.7633597559024037, + "grad_norm": 0.2087711431944956, + "learning_rate": 1.8346616832271923e-05, + "loss": 0.3531, + "num_tokens": 986548571.0, + "step": 1292 + }, + { + "epoch": 1.7647253368610842, + "grad_norm": 0.18843587840261322, + "learning_rate": 1.834374432979245e-05, + "loss": 0.3435, + "num_tokens": 987277391.0, + "step": 1293 + }, + { + "epoch": 1.7660909178197648, + "grad_norm": 0.1899610564719992, + "learning_rate": 1.8340869587054617e-05, + "loss": 0.3321, + "num_tokens": 988038211.0, + "step": 1294 + }, + { + "epoch": 1.7674564987784451, + "grad_norm": 0.21043918629186983, + "learning_rate": 1.8337992604935374e-05, + "loss": 0.3537, + "num_tokens": 988843265.0, + "step": 1295 + }, + { + "epoch": 1.7688220797371257, + "grad_norm": 0.21348032466562758, + "learning_rate": 1.8335113384312372e-05, + "loss": 0.3485, + "num_tokens": 989739636.0, + "step": 1296 + }, + { + "epoch": 1.770187660695806, + "grad_norm": 0.18107832366735577, + "learning_rate": 1.8332231926063935e-05, + "loss": 0.3537, + "num_tokens": 990541518.0, + "step": 1297 + }, + { + "epoch": 1.7715532416544866, + "grad_norm": 0.21420034140432206, + "learning_rate": 1.832934823106907e-05, + "loss": 0.3336, + "num_tokens": 991288069.0, + "step": 1298 + }, + { + "epoch": 1.7729188226131671, + "grad_norm": 0.19758975862234557, + "learning_rate": 1.832646230020746e-05, + "loss": 0.3308, + "num_tokens": 991987588.0, + "step": 1299 + }, + { + "epoch": 1.7742844035718477, + "grad_norm": 0.4864055265374092, + "learning_rate": 1.832357413435949e-05, + "loss": 0.3276, + "num_tokens": 992744334.0, + "step": 1300 + }, + { + "epoch": 1.7756499845305282, + "grad_norm": 0.2371564291653087, + "learning_rate": 1.8320683734406208e-05, + "loss": 0.3679, + "num_tokens": 993497890.0, + "step": 1301 + }, + { + "epoch": 1.7770155654892088, + "grad_norm": 0.17379743788282073, + "learning_rate": 1.831779110122935e-05, + "loss": 0.3412, + "num_tokens": 994238417.0, + "step": 1302 + }, + { + "epoch": 1.7783811464478894, + "grad_norm": 0.23271419634641805, + "learning_rate": 1.831489623571133e-05, + "loss": 0.3325, + "num_tokens": 994963096.0, + "step": 1303 + }, + { + "epoch": 1.7797467274065697, + "grad_norm": 0.21479665618764432, + "learning_rate": 1.8311999138735256e-05, + "loss": 0.3362, + "num_tokens": 995640603.0, + "step": 1304 + }, + { + "epoch": 1.7811123083652503, + "grad_norm": 0.2082157870286439, + "learning_rate": 1.83090998111849e-05, + "loss": 0.3356, + "num_tokens": 996328265.0, + "step": 1305 + }, + { + "epoch": 1.7824778893239306, + "grad_norm": 0.21703633294899552, + "learning_rate": 1.830619825394472e-05, + "loss": 0.3453, + "num_tokens": 997089711.0, + "step": 1306 + }, + { + "epoch": 1.7838434702826111, + "grad_norm": 0.20984693627624557, + "learning_rate": 1.8303294467899853e-05, + "loss": 0.3483, + "num_tokens": 997803333.0, + "step": 1307 + }, + { + "epoch": 1.7852090512412917, + "grad_norm": 0.2023558569650378, + "learning_rate": 1.830038845393613e-05, + "loss": 0.3303, + "num_tokens": 998555301.0, + "step": 1308 + }, + { + "epoch": 1.7865746321999723, + "grad_norm": 0.189416726118961, + "learning_rate": 1.829748021294004e-05, + "loss": 0.3382, + "num_tokens": 999278977.0, + "step": 1309 + }, + { + "epoch": 1.7879402131586528, + "grad_norm": 0.20543509911004632, + "learning_rate": 1.829456974579876e-05, + "loss": 0.3334, + "num_tokens": 1000014672.0, + "step": 1310 + }, + { + "epoch": 1.7893057941173334, + "grad_norm": 0.18601645776267262, + "learning_rate": 1.8291657053400154e-05, + "loss": 0.3469, + "num_tokens": 1000741773.0, + "step": 1311 + }, + { + "epoch": 1.790671375076014, + "grad_norm": 0.21006896733825822, + "learning_rate": 1.8288742136632755e-05, + "loss": 0.3425, + "num_tokens": 1001494940.0, + "step": 1312 + }, + { + "epoch": 1.7920369560346943, + "grad_norm": 0.19269957329582907, + "learning_rate": 1.828582499638578e-05, + "loss": 0.3707, + "num_tokens": 1002252893.0, + "step": 1313 + }, + { + "epoch": 1.7934025369933748, + "grad_norm": 0.2062901253720772, + "learning_rate": 1.8282905633549122e-05, + "loss": 0.3333, + "num_tokens": 1002977777.0, + "step": 1314 + }, + { + "epoch": 1.7947681179520552, + "grad_norm": 0.2042451698660808, + "learning_rate": 1.8279984049013352e-05, + "loss": 0.3446, + "num_tokens": 1003740831.0, + "step": 1315 + }, + { + "epoch": 1.7961336989107357, + "grad_norm": 0.18134213958973186, + "learning_rate": 1.8277060243669714e-05, + "loss": 0.3451, + "num_tokens": 1004414611.0, + "step": 1316 + }, + { + "epoch": 1.7974992798694163, + "grad_norm": 0.19994069606619363, + "learning_rate": 1.827413421841014e-05, + "loss": 0.3505, + "num_tokens": 1005144139.0, + "step": 1317 + }, + { + "epoch": 1.7988648608280968, + "grad_norm": 0.19966448198429756, + "learning_rate": 1.827120597412723e-05, + "loss": 0.3372, + "num_tokens": 1005913575.0, + "step": 1318 + }, + { + "epoch": 1.8002304417867774, + "grad_norm": 0.19633939480787413, + "learning_rate": 1.826827551171427e-05, + "loss": 0.3519, + "num_tokens": 1006592989.0, + "step": 1319 + }, + { + "epoch": 1.801596022745458, + "grad_norm": 0.19740868950887808, + "learning_rate": 1.8265342832065215e-05, + "loss": 0.3342, + "num_tokens": 1007310808.0, + "step": 1320 + }, + { + "epoch": 1.8029616037041385, + "grad_norm": 0.1918995422147318, + "learning_rate": 1.8262407936074696e-05, + "loss": 0.3421, + "num_tokens": 1007965480.0, + "step": 1321 + }, + { + "epoch": 1.8043271846628188, + "grad_norm": 0.21099660611905727, + "learning_rate": 1.8259470824638018e-05, + "loss": 0.3342, + "num_tokens": 1008622295.0, + "step": 1322 + }, + { + "epoch": 1.8056927656214994, + "grad_norm": 0.18777179813749642, + "learning_rate": 1.8256531498651178e-05, + "loss": 0.3531, + "num_tokens": 1009364283.0, + "step": 1323 + }, + { + "epoch": 1.8070583465801797, + "grad_norm": 0.18459150264304755, + "learning_rate": 1.8253589959010832e-05, + "loss": 0.343, + "num_tokens": 1010141126.0, + "step": 1324 + }, + { + "epoch": 1.8084239275388603, + "grad_norm": 0.19433979208056637, + "learning_rate": 1.8250646206614314e-05, + "loss": 0.364, + "num_tokens": 1010884857.0, + "step": 1325 + }, + { + "epoch": 1.8097895084975408, + "grad_norm": 0.20302257404012433, + "learning_rate": 1.8247700242359635e-05, + "loss": 0.3375, + "num_tokens": 1011570362.0, + "step": 1326 + }, + { + "epoch": 1.8111550894562214, + "grad_norm": 0.18484656566466565, + "learning_rate": 1.8244752067145485e-05, + "loss": 0.3383, + "num_tokens": 1012365637.0, + "step": 1327 + }, + { + "epoch": 1.812520670414902, + "grad_norm": 0.17529474724343838, + "learning_rate": 1.8241801681871224e-05, + "loss": 0.3416, + "num_tokens": 1013157884.0, + "step": 1328 + }, + { + "epoch": 1.8138862513735825, + "grad_norm": 0.1804607033897169, + "learning_rate": 1.8238849087436887e-05, + "loss": 0.3358, + "num_tokens": 1013881776.0, + "step": 1329 + }, + { + "epoch": 1.815251832332263, + "grad_norm": 0.19716895443010413, + "learning_rate": 1.8235894284743176e-05, + "loss": 0.3474, + "num_tokens": 1014637454.0, + "step": 1330 + }, + { + "epoch": 1.8166174132909434, + "grad_norm": 0.19476566376415835, + "learning_rate": 1.823293727469148e-05, + "loss": 0.3329, + "num_tokens": 1015365856.0, + "step": 1331 + }, + { + "epoch": 1.817982994249624, + "grad_norm": 0.17409451945346133, + "learning_rate": 1.822997805818386e-05, + "loss": 0.3352, + "num_tokens": 1016121239.0, + "step": 1332 + }, + { + "epoch": 1.8193485752083043, + "grad_norm": 0.19787808590456965, + "learning_rate": 1.822701663612303e-05, + "loss": 0.351, + "num_tokens": 1016913224.0, + "step": 1333 + }, + { + "epoch": 1.8207141561669848, + "grad_norm": 0.17533363730688015, + "learning_rate": 1.8224053009412405e-05, + "loss": 0.3369, + "num_tokens": 1017707092.0, + "step": 1334 + }, + { + "epoch": 1.8220797371256654, + "grad_norm": 0.18328965178318862, + "learning_rate": 1.822108717895605e-05, + "loss": 0.3517, + "num_tokens": 1018451466.0, + "step": 1335 + }, + { + "epoch": 1.823445318084346, + "grad_norm": 0.19823621688171397, + "learning_rate": 1.821811914565872e-05, + "loss": 0.3329, + "num_tokens": 1019208986.0, + "step": 1336 + }, + { + "epoch": 1.8248108990430265, + "grad_norm": 0.17404287520777875, + "learning_rate": 1.8215148910425824e-05, + "loss": 0.3345, + "num_tokens": 1019981657.0, + "step": 1337 + }, + { + "epoch": 1.826176480001707, + "grad_norm": 0.17525925835324338, + "learning_rate": 1.821217647416346e-05, + "loss": 0.3358, + "num_tokens": 1020766462.0, + "step": 1338 + }, + { + "epoch": 1.8275420609603876, + "grad_norm": 0.19301643166854465, + "learning_rate": 1.8209201837778385e-05, + "loss": 0.3378, + "num_tokens": 1021524094.0, + "step": 1339 + }, + { + "epoch": 1.828907641919068, + "grad_norm": 0.1897757885147259, + "learning_rate": 1.8206225002178037e-05, + "loss": 0.3367, + "num_tokens": 1022304667.0, + "step": 1340 + }, + { + "epoch": 1.8302732228777485, + "grad_norm": 0.16908449685320562, + "learning_rate": 1.820324596827051e-05, + "loss": 0.3492, + "num_tokens": 1023037930.0, + "step": 1341 + }, + { + "epoch": 1.8316388038364289, + "grad_norm": 0.18655023969990409, + "learning_rate": 1.8200264736964588e-05, + "loss": 0.3545, + "num_tokens": 1023873838.0, + "step": 1342 + }, + { + "epoch": 1.8330043847951094, + "grad_norm": 0.17937181315784795, + "learning_rate": 1.819728130916971e-05, + "loss": 0.3418, + "num_tokens": 1024651306.0, + "step": 1343 + }, + { + "epoch": 1.83436996575379, + "grad_norm": 0.1850719424441285, + "learning_rate": 1.8194295685795997e-05, + "loss": 0.3619, + "num_tokens": 1025393758.0, + "step": 1344 + }, + { + "epoch": 1.8357355467124705, + "grad_norm": 0.1915486303578099, + "learning_rate": 1.819130786775422e-05, + "loss": 0.3356, + "num_tokens": 1026091695.0, + "step": 1345 + }, + { + "epoch": 1.837101127671151, + "grad_norm": 0.2051535179580251, + "learning_rate": 1.818831785595585e-05, + "loss": 0.3354, + "num_tokens": 1026878436.0, + "step": 1346 + }, + { + "epoch": 1.8384667086298316, + "grad_norm": 0.1766179344651269, + "learning_rate": 1.8185325651312997e-05, + "loss": 0.3519, + "num_tokens": 1027662285.0, + "step": 1347 + }, + { + "epoch": 1.8398322895885122, + "grad_norm": 0.18603370818283574, + "learning_rate": 1.818233125473846e-05, + "loss": 0.334, + "num_tokens": 1028417573.0, + "step": 1348 + }, + { + "epoch": 1.8411978705471925, + "grad_norm": 0.20698208737015042, + "learning_rate": 1.8179334667145698e-05, + "loss": 0.3609, + "num_tokens": 1029154389.0, + "step": 1349 + }, + { + "epoch": 1.842563451505873, + "grad_norm": 0.22286434119183543, + "learning_rate": 1.8176335889448833e-05, + "loss": 0.3437, + "num_tokens": 1029957505.0, + "step": 1350 + }, + { + "epoch": 1.8439290324645534, + "grad_norm": 0.19158961491637777, + "learning_rate": 1.817333492256268e-05, + "loss": 0.3447, + "num_tokens": 1030745318.0, + "step": 1351 + }, + { + "epoch": 1.845294613423234, + "grad_norm": 0.18335545141159662, + "learning_rate": 1.817033176740268e-05, + "loss": 0.3514, + "num_tokens": 1031487740.0, + "step": 1352 + }, + { + "epoch": 1.8466601943819145, + "grad_norm": 0.18280195300100643, + "learning_rate": 1.816732642488499e-05, + "loss": 0.3419, + "num_tokens": 1032266420.0, + "step": 1353 + }, + { + "epoch": 1.848025775340595, + "grad_norm": 0.2030145162230529, + "learning_rate": 1.8164318895926394e-05, + "loss": 0.3523, + "num_tokens": 1033022760.0, + "step": 1354 + }, + { + "epoch": 1.8493913562992756, + "grad_norm": 0.19180827359230376, + "learning_rate": 1.816130918144436e-05, + "loss": 0.3372, + "num_tokens": 1033757383.0, + "step": 1355 + }, + { + "epoch": 1.8507569372579562, + "grad_norm": 0.17436404994867266, + "learning_rate": 1.8158297282357027e-05, + "loss": 0.3591, + "num_tokens": 1034547784.0, + "step": 1356 + }, + { + "epoch": 1.8521225182166368, + "grad_norm": 0.21384976445213802, + "learning_rate": 1.8155283199583194e-05, + "loss": 0.3487, + "num_tokens": 1035322866.0, + "step": 1357 + }, + { + "epoch": 1.853488099175317, + "grad_norm": 0.1770464325798218, + "learning_rate": 1.8152266934042326e-05, + "loss": 0.3485, + "num_tokens": 1036103208.0, + "step": 1358 + }, + { + "epoch": 1.8548536801339977, + "grad_norm": 0.20411953272383584, + "learning_rate": 1.8149248486654554e-05, + "loss": 0.3436, + "num_tokens": 1036808074.0, + "step": 1359 + }, + { + "epoch": 1.856219261092678, + "grad_norm": 0.1840135784819275, + "learning_rate": 1.8146227858340674e-05, + "loss": 0.3556, + "num_tokens": 1037655441.0, + "step": 1360 + }, + { + "epoch": 1.8575848420513585, + "grad_norm": 0.1893308807019159, + "learning_rate": 1.8143205050022154e-05, + "loss": 0.3404, + "num_tokens": 1038422815.0, + "step": 1361 + }, + { + "epoch": 1.858950423010039, + "grad_norm": 0.18346126676299288, + "learning_rate": 1.8140180062621117e-05, + "loss": 0.3469, + "num_tokens": 1039246967.0, + "step": 1362 + }, + { + "epoch": 1.8603160039687197, + "grad_norm": 0.18867242184477462, + "learning_rate": 1.8137152897060358e-05, + "loss": 0.3371, + "num_tokens": 1039984720.0, + "step": 1363 + }, + { + "epoch": 1.8616815849274002, + "grad_norm": 0.1874502479784267, + "learning_rate": 1.8134123554263337e-05, + "loss": 0.35, + "num_tokens": 1040678355.0, + "step": 1364 + }, + { + "epoch": 1.8630471658860808, + "grad_norm": 0.19925499420288498, + "learning_rate": 1.813109203515417e-05, + "loss": 0.3405, + "num_tokens": 1041440149.0, + "step": 1365 + }, + { + "epoch": 1.8644127468447613, + "grad_norm": 0.1985077086730961, + "learning_rate": 1.8128058340657643e-05, + "loss": 0.3463, + "num_tokens": 1042158039.0, + "step": 1366 + }, + { + "epoch": 1.8657783278034417, + "grad_norm": 0.20502006617873098, + "learning_rate": 1.8125022471699208e-05, + "loss": 0.3472, + "num_tokens": 1042927338.0, + "step": 1367 + }, + { + "epoch": 1.8671439087621222, + "grad_norm": 0.1748512898524473, + "learning_rate": 1.8121984429204977e-05, + "loss": 0.3513, + "num_tokens": 1043710526.0, + "step": 1368 + }, + { + "epoch": 1.8685094897208026, + "grad_norm": 0.20156828741505178, + "learning_rate": 1.811894421410172e-05, + "loss": 0.3544, + "num_tokens": 1044452119.0, + "step": 1369 + }, + { + "epoch": 1.869875070679483, + "grad_norm": 0.1800226382191922, + "learning_rate": 1.8115901827316883e-05, + "loss": 0.3344, + "num_tokens": 1045251019.0, + "step": 1370 + }, + { + "epoch": 1.8712406516381637, + "grad_norm": 0.18593556528140423, + "learning_rate": 1.811285726977856e-05, + "loss": 0.3393, + "num_tokens": 1046008119.0, + "step": 1371 + }, + { + "epoch": 1.8726062325968442, + "grad_norm": 0.1928769049094261, + "learning_rate": 1.8109810542415513e-05, + "loss": 0.3407, + "num_tokens": 1046755215.0, + "step": 1372 + }, + { + "epoch": 1.8739718135555248, + "grad_norm": 0.19303744783676646, + "learning_rate": 1.810676164615717e-05, + "loss": 0.3482, + "num_tokens": 1047551893.0, + "step": 1373 + }, + { + "epoch": 1.8753373945142053, + "grad_norm": 0.1809117622696174, + "learning_rate": 1.810371058193362e-05, + "loss": 0.3456, + "num_tokens": 1048327530.0, + "step": 1374 + }, + { + "epoch": 1.876702975472886, + "grad_norm": 0.17724932920680705, + "learning_rate": 1.8100657350675604e-05, + "loss": 0.3417, + "num_tokens": 1049156457.0, + "step": 1375 + }, + { + "epoch": 1.8780685564315662, + "grad_norm": 0.19624854630084038, + "learning_rate": 1.8097601953314535e-05, + "loss": 0.3414, + "num_tokens": 1049866305.0, + "step": 1376 + }, + { + "epoch": 1.8794341373902468, + "grad_norm": 0.19646324670650459, + "learning_rate": 1.809454439078248e-05, + "loss": 0.352, + "num_tokens": 1050527086.0, + "step": 1377 + }, + { + "epoch": 1.8807997183489271, + "grad_norm": 0.19012771679197407, + "learning_rate": 1.8091484664012167e-05, + "loss": 0.3287, + "num_tokens": 1051300919.0, + "step": 1378 + }, + { + "epoch": 1.8821652993076077, + "grad_norm": 0.2020061458130394, + "learning_rate": 1.8088422773936997e-05, + "loss": 0.3303, + "num_tokens": 1052052554.0, + "step": 1379 + }, + { + "epoch": 1.8835308802662882, + "grad_norm": 0.18387821178084687, + "learning_rate": 1.8085358721491006e-05, + "loss": 0.3427, + "num_tokens": 1052805026.0, + "step": 1380 + }, + { + "epoch": 1.8848964612249688, + "grad_norm": 0.18378571661837326, + "learning_rate": 1.808229250760891e-05, + "loss": 0.359, + "num_tokens": 1053635092.0, + "step": 1381 + }, + { + "epoch": 1.8862620421836493, + "grad_norm": 0.19440042091365756, + "learning_rate": 1.8079224133226082e-05, + "loss": 0.33, + "num_tokens": 1054399214.0, + "step": 1382 + }, + { + "epoch": 1.88762762314233, + "grad_norm": 0.18666890565975022, + "learning_rate": 1.8076153599278544e-05, + "loss": 0.3413, + "num_tokens": 1055161105.0, + "step": 1383 + }, + { + "epoch": 1.8889932041010105, + "grad_norm": 0.18115374497396763, + "learning_rate": 1.8073080906702983e-05, + "loss": 0.3274, + "num_tokens": 1055888181.0, + "step": 1384 + }, + { + "epoch": 1.8903587850596908, + "grad_norm": 0.2517316823057435, + "learning_rate": 1.807000605643675e-05, + "loss": 0.3323, + "num_tokens": 1056594511.0, + "step": 1385 + }, + { + "epoch": 1.8917243660183714, + "grad_norm": 0.20535764923706187, + "learning_rate": 1.8066929049417844e-05, + "loss": 0.335, + "num_tokens": 1057373681.0, + "step": 1386 + }, + { + "epoch": 1.8930899469770517, + "grad_norm": 0.20568523614243436, + "learning_rate": 1.8063849886584922e-05, + "loss": 0.3372, + "num_tokens": 1058148174.0, + "step": 1387 + }, + { + "epoch": 1.8944555279357322, + "grad_norm": 0.1824517469086104, + "learning_rate": 1.8060768568877313e-05, + "loss": 0.3408, + "num_tokens": 1058931524.0, + "step": 1388 + }, + { + "epoch": 1.8958211088944128, + "grad_norm": 0.19623630817547652, + "learning_rate": 1.8057685097234995e-05, + "loss": 0.3405, + "num_tokens": 1059745866.0, + "step": 1389 + }, + { + "epoch": 1.8971866898530934, + "grad_norm": 0.18249090828949116, + "learning_rate": 1.805459947259859e-05, + "loss": 0.3248, + "num_tokens": 1060462436.0, + "step": 1390 + }, + { + "epoch": 1.898552270811774, + "grad_norm": 0.18266165607601342, + "learning_rate": 1.8051511695909396e-05, + "loss": 0.3393, + "num_tokens": 1061121671.0, + "step": 1391 + }, + { + "epoch": 1.8999178517704545, + "grad_norm": 0.18501811951464603, + "learning_rate": 1.8048421768109357e-05, + "loss": 0.3517, + "num_tokens": 1061955157.0, + "step": 1392 + }, + { + "epoch": 1.901283432729135, + "grad_norm": 0.18574217741211582, + "learning_rate": 1.804532969014108e-05, + "loss": 0.3546, + "num_tokens": 1062734127.0, + "step": 1393 + }, + { + "epoch": 1.9026490136878154, + "grad_norm": 0.18971508137208495, + "learning_rate": 1.8042235462947823e-05, + "loss": 0.3454, + "num_tokens": 1063546649.0, + "step": 1394 + }, + { + "epoch": 1.904014594646496, + "grad_norm": 0.20066473397012283, + "learning_rate": 1.8039139087473494e-05, + "loss": 0.3441, + "num_tokens": 1064301713.0, + "step": 1395 + }, + { + "epoch": 1.9053801756051763, + "grad_norm": 0.19623389172055286, + "learning_rate": 1.8036040564662674e-05, + "loss": 0.3451, + "num_tokens": 1065069050.0, + "step": 1396 + }, + { + "epoch": 1.9067457565638568, + "grad_norm": 0.18494830571207507, + "learning_rate": 1.803293989546058e-05, + "loss": 0.3417, + "num_tokens": 1065867837.0, + "step": 1397 + }, + { + "epoch": 1.9081113375225374, + "grad_norm": 0.20116005447955038, + "learning_rate": 1.8029837080813095e-05, + "loss": 0.3438, + "num_tokens": 1066635576.0, + "step": 1398 + }, + { + "epoch": 1.909476918481218, + "grad_norm": 0.18322351206848775, + "learning_rate": 1.8026732121666758e-05, + "loss": 0.3299, + "num_tokens": 1067400109.0, + "step": 1399 + }, + { + "epoch": 1.9108424994398985, + "grad_norm": 0.19603943252294853, + "learning_rate": 1.802362501896875e-05, + "loss": 0.3457, + "num_tokens": 1068169071.0, + "step": 1400 + }, + { + "epoch": 1.912208080398579, + "grad_norm": 0.18664887516882694, + "learning_rate": 1.802051577366691e-05, + "loss": 0.3416, + "num_tokens": 1068973662.0, + "step": 1401 + }, + { + "epoch": 1.9135736613572596, + "grad_norm": 0.19749219202331883, + "learning_rate": 1.8017404386709747e-05, + "loss": 0.3249, + "num_tokens": 1069687808.0, + "step": 1402 + }, + { + "epoch": 1.91493924231594, + "grad_norm": 0.17389219368672768, + "learning_rate": 1.80142908590464e-05, + "loss": 0.3398, + "num_tokens": 1070490203.0, + "step": 1403 + }, + { + "epoch": 1.9163048232746205, + "grad_norm": 0.2000966956102326, + "learning_rate": 1.8011175191626676e-05, + "loss": 0.3516, + "num_tokens": 1071259380.0, + "step": 1404 + }, + { + "epoch": 1.9176704042333008, + "grad_norm": 0.1825884978583894, + "learning_rate": 1.8008057385401028e-05, + "loss": 0.3367, + "num_tokens": 1071981465.0, + "step": 1405 + }, + { + "epoch": 1.9190359851919814, + "grad_norm": 0.20332807817071039, + "learning_rate": 1.800493744132057e-05, + "loss": 0.3562, + "num_tokens": 1072766456.0, + "step": 1406 + }, + { + "epoch": 1.920401566150662, + "grad_norm": 0.1815362987061042, + "learning_rate": 1.800181536033705e-05, + "loss": 0.3606, + "num_tokens": 1073561309.0, + "step": 1407 + }, + { + "epoch": 1.9217671471093425, + "grad_norm": 0.1952574265074895, + "learning_rate": 1.7998691143402888e-05, + "loss": 0.3375, + "num_tokens": 1074288618.0, + "step": 1408 + }, + { + "epoch": 1.923132728068023, + "grad_norm": 0.18999880229380975, + "learning_rate": 1.7995564791471146e-05, + "loss": 0.3381, + "num_tokens": 1075024005.0, + "step": 1409 + }, + { + "epoch": 1.9244983090267036, + "grad_norm": 0.18569266367046677, + "learning_rate": 1.799243630549554e-05, + "loss": 0.3388, + "num_tokens": 1075820395.0, + "step": 1410 + }, + { + "epoch": 1.9258638899853842, + "grad_norm": 0.18453835381493514, + "learning_rate": 1.7989305686430433e-05, + "loss": 0.3395, + "num_tokens": 1076589499.0, + "step": 1411 + }, + { + "epoch": 1.9272294709440645, + "grad_norm": 0.19825938852606126, + "learning_rate": 1.7986172935230843e-05, + "loss": 0.3377, + "num_tokens": 1077334267.0, + "step": 1412 + }, + { + "epoch": 1.928595051902745, + "grad_norm": 0.16833730569022778, + "learning_rate": 1.7983038052852434e-05, + "loss": 0.3356, + "num_tokens": 1078129944.0, + "step": 1413 + }, + { + "epoch": 1.9299606328614254, + "grad_norm": 0.19846511955057783, + "learning_rate": 1.797990104025153e-05, + "loss": 0.3289, + "num_tokens": 1078836245.0, + "step": 1414 + }, + { + "epoch": 1.931326213820106, + "grad_norm": 0.178309376252815, + "learning_rate": 1.7976761898385086e-05, + "loss": 0.3527, + "num_tokens": 1079587809.0, + "step": 1415 + }, + { + "epoch": 1.9326917947787865, + "grad_norm": 0.1933292302721321, + "learning_rate": 1.7973620628210725e-05, + "loss": 0.3511, + "num_tokens": 1080361621.0, + "step": 1416 + }, + { + "epoch": 1.934057375737467, + "grad_norm": 0.19036434205641373, + "learning_rate": 1.7970477230686717e-05, + "loss": 0.3517, + "num_tokens": 1081039509.0, + "step": 1417 + }, + { + "epoch": 1.9354229566961476, + "grad_norm": 0.19141049758501175, + "learning_rate": 1.796733170677197e-05, + "loss": 0.3487, + "num_tokens": 1081839421.0, + "step": 1418 + }, + { + "epoch": 1.9367885376548282, + "grad_norm": 0.17934238191391758, + "learning_rate": 1.7964184057426045e-05, + "loss": 0.338, + "num_tokens": 1082569390.0, + "step": 1419 + }, + { + "epoch": 1.9381541186135087, + "grad_norm": 0.19807521633435893, + "learning_rate": 1.796103428360916e-05, + "loss": 0.3497, + "num_tokens": 1083292577.0, + "step": 1420 + }, + { + "epoch": 1.939519699572189, + "grad_norm": 0.18305012040318092, + "learning_rate": 1.7957882386282173e-05, + "loss": 0.3464, + "num_tokens": 1084064757.0, + "step": 1421 + }, + { + "epoch": 1.9408852805308696, + "grad_norm": 0.18450581957006604, + "learning_rate": 1.7954728366406588e-05, + "loss": 0.3261, + "num_tokens": 1084831561.0, + "step": 1422 + }, + { + "epoch": 1.94225086148955, + "grad_norm": 0.18184013588411369, + "learning_rate": 1.7951572224944564e-05, + "loss": 0.3498, + "num_tokens": 1085676633.0, + "step": 1423 + }, + { + "epoch": 1.9436164424482305, + "grad_norm": 0.19873453653967915, + "learning_rate": 1.79484139628589e-05, + "loss": 0.3502, + "num_tokens": 1086491645.0, + "step": 1424 + }, + { + "epoch": 1.944982023406911, + "grad_norm": 0.1969652974872697, + "learning_rate": 1.7945253581113046e-05, + "loss": 0.3438, + "num_tokens": 1087234735.0, + "step": 1425 + }, + { + "epoch": 1.9463476043655916, + "grad_norm": 0.18346487266648323, + "learning_rate": 1.7942091080671097e-05, + "loss": 0.3396, + "num_tokens": 1087992936.0, + "step": 1426 + }, + { + "epoch": 1.9477131853242722, + "grad_norm": 0.18034067155746153, + "learning_rate": 1.7938926462497797e-05, + "loss": 0.3447, + "num_tokens": 1088704460.0, + "step": 1427 + }, + { + "epoch": 1.9490787662829527, + "grad_norm": 0.19684308966889566, + "learning_rate": 1.7935759727558536e-05, + "loss": 0.3405, + "num_tokens": 1089486597.0, + "step": 1428 + }, + { + "epoch": 1.9504443472416333, + "grad_norm": 0.17872811583953227, + "learning_rate": 1.793259087681934e-05, + "loss": 0.3329, + "num_tokens": 1090271761.0, + "step": 1429 + }, + { + "epoch": 1.9518099282003136, + "grad_norm": 0.1702694455587737, + "learning_rate": 1.792941991124689e-05, + "loss": 0.3524, + "num_tokens": 1091088504.0, + "step": 1430 + }, + { + "epoch": 1.9531755091589942, + "grad_norm": 0.19731605642692482, + "learning_rate": 1.7926246831808516e-05, + "loss": 0.345, + "num_tokens": 1091844749.0, + "step": 1431 + }, + { + "epoch": 1.9545410901176745, + "grad_norm": 0.20055886160422845, + "learning_rate": 1.7923071639472188e-05, + "loss": 0.3651, + "num_tokens": 1092633853.0, + "step": 1432 + }, + { + "epoch": 1.955906671076355, + "grad_norm": 0.17410827724062555, + "learning_rate": 1.791989433520651e-05, + "loss": 0.3522, + "num_tokens": 1093469460.0, + "step": 1433 + }, + { + "epoch": 1.9572722520350356, + "grad_norm": 0.20563115832867138, + "learning_rate": 1.7916714919980745e-05, + "loss": 0.3461, + "num_tokens": 1094146535.0, + "step": 1434 + }, + { + "epoch": 1.9586378329937162, + "grad_norm": 0.17590890805029433, + "learning_rate": 1.7913533394764798e-05, + "loss": 0.3383, + "num_tokens": 1094903981.0, + "step": 1435 + }, + { + "epoch": 1.9600034139523967, + "grad_norm": 0.20378783515659474, + "learning_rate": 1.7910349760529207e-05, + "loss": 0.326, + "num_tokens": 1095626160.0, + "step": 1436 + }, + { + "epoch": 1.9613689949110773, + "grad_norm": 0.1899386729268108, + "learning_rate": 1.7907164018245173e-05, + "loss": 0.3428, + "num_tokens": 1096460621.0, + "step": 1437 + }, + { + "epoch": 1.9627345758697579, + "grad_norm": 0.19309482311331763, + "learning_rate": 1.7903976168884517e-05, + "loss": 0.3396, + "num_tokens": 1097258051.0, + "step": 1438 + }, + { + "epoch": 1.9641001568284382, + "grad_norm": 0.19078208782549586, + "learning_rate": 1.790078621341972e-05, + "loss": 0.3433, + "num_tokens": 1098056109.0, + "step": 1439 + }, + { + "epoch": 1.9654657377871187, + "grad_norm": 0.2002654020758272, + "learning_rate": 1.7897594152823893e-05, + "loss": 0.3435, + "num_tokens": 1098865627.0, + "step": 1440 + }, + { + "epoch": 1.966831318745799, + "grad_norm": 0.20237718741666783, + "learning_rate": 1.7894399988070804e-05, + "loss": 0.3437, + "num_tokens": 1099577923.0, + "step": 1441 + }, + { + "epoch": 1.9681968997044796, + "grad_norm": 0.19323889990176754, + "learning_rate": 1.7891203720134846e-05, + "loss": 0.3343, + "num_tokens": 1100345883.0, + "step": 1442 + }, + { + "epoch": 1.9695624806631602, + "grad_norm": 0.17799084938808096, + "learning_rate": 1.788800534999107e-05, + "loss": 0.3367, + "num_tokens": 1101023823.0, + "step": 1443 + }, + { + "epoch": 1.9709280616218408, + "grad_norm": 0.22052593881454585, + "learning_rate": 1.7884804878615153e-05, + "loss": 0.3518, + "num_tokens": 1101822081.0, + "step": 1444 + }, + { + "epoch": 1.9722936425805213, + "grad_norm": 0.16561302725627677, + "learning_rate": 1.7881602306983423e-05, + "loss": 0.3294, + "num_tokens": 1102662227.0, + "step": 1445 + }, + { + "epoch": 1.9736592235392019, + "grad_norm": 0.19423322388077277, + "learning_rate": 1.7878397636072846e-05, + "loss": 0.3456, + "num_tokens": 1103413966.0, + "step": 1446 + }, + { + "epoch": 1.9750248044978824, + "grad_norm": 0.19888086294560858, + "learning_rate": 1.7875190866861027e-05, + "loss": 0.3289, + "num_tokens": 1104141719.0, + "step": 1447 + }, + { + "epoch": 1.9763903854565628, + "grad_norm": 0.1942868085005814, + "learning_rate": 1.7871982000326214e-05, + "loss": 0.3388, + "num_tokens": 1104907981.0, + "step": 1448 + }, + { + "epoch": 1.9777559664152433, + "grad_norm": 0.20019352253834014, + "learning_rate": 1.7868771037447298e-05, + "loss": 0.3479, + "num_tokens": 1105658789.0, + "step": 1449 + }, + { + "epoch": 1.9791215473739237, + "grad_norm": 0.18434046637128346, + "learning_rate": 1.7865557979203795e-05, + "loss": 0.3244, + "num_tokens": 1106411334.0, + "step": 1450 + }, + { + "epoch": 1.9804871283326042, + "grad_norm": 0.20166965159632833, + "learning_rate": 1.7862342826575877e-05, + "loss": 0.3504, + "num_tokens": 1107196407.0, + "step": 1451 + }, + { + "epoch": 1.9818527092912848, + "grad_norm": 0.17556391549314052, + "learning_rate": 1.785912558054434e-05, + "loss": 0.3487, + "num_tokens": 1107970681.0, + "step": 1452 + }, + { + "epoch": 1.9832182902499653, + "grad_norm": 0.22181043820844654, + "learning_rate": 1.785590624209064e-05, + "loss": 0.3418, + "num_tokens": 1108752574.0, + "step": 1453 + }, + { + "epoch": 1.9845838712086459, + "grad_norm": 0.18071141709683847, + "learning_rate": 1.7852684812196843e-05, + "loss": 0.3407, + "num_tokens": 1109515949.0, + "step": 1454 + }, + { + "epoch": 1.9859494521673264, + "grad_norm": 0.22858368979083815, + "learning_rate": 1.7849461291845675e-05, + "loss": 0.3491, + "num_tokens": 1110352057.0, + "step": 1455 + }, + { + "epoch": 1.987315033126007, + "grad_norm": 0.1904827435339122, + "learning_rate": 1.7846235682020495e-05, + "loss": 0.3487, + "num_tokens": 1111038880.0, + "step": 1456 + }, + { + "epoch": 1.9886806140846873, + "grad_norm": 0.21938854691078485, + "learning_rate": 1.7843007983705292e-05, + "loss": 0.3461, + "num_tokens": 1111785922.0, + "step": 1457 + }, + { + "epoch": 1.9900461950433679, + "grad_norm": 0.1953156546514855, + "learning_rate": 1.7839778197884696e-05, + "loss": 0.3412, + "num_tokens": 1112590900.0, + "step": 1458 + }, + { + "epoch": 1.9914117760020482, + "grad_norm": 0.20727646574345288, + "learning_rate": 1.783654632554398e-05, + "loss": 0.3424, + "num_tokens": 1113357660.0, + "step": 1459 + }, + { + "epoch": 1.9927773569607288, + "grad_norm": 0.18901603292721011, + "learning_rate": 1.7833312367669047e-05, + "loss": 0.3393, + "num_tokens": 1114126554.0, + "step": 1460 + }, + { + "epoch": 1.9941429379194093, + "grad_norm": 0.1734216241668799, + "learning_rate": 1.7830076325246437e-05, + "loss": 0.3335, + "num_tokens": 1114877291.0, + "step": 1461 + }, + { + "epoch": 1.9955085188780899, + "grad_norm": 0.17588500008733202, + "learning_rate": 1.7826838199263323e-05, + "loss": 0.327, + "num_tokens": 1115672422.0, + "step": 1462 + }, + { + "epoch": 1.9968740998367704, + "grad_norm": 0.18510264401799617, + "learning_rate": 1.782359799070752e-05, + "loss": 0.3425, + "num_tokens": 1116443311.0, + "step": 1463 + }, + { + "epoch": 1.998239680795451, + "grad_norm": 0.1778572692496335, + "learning_rate": 1.7820355700567476e-05, + "loss": 0.3466, + "num_tokens": 1117179618.0, + "step": 1464 + }, + { + "epoch": 1.9996052617541316, + "grad_norm": 0.20439157694766044, + "learning_rate": 1.781711132983227e-05, + "loss": 0.3366, + "num_tokens": 1117875132.0, + "step": 1465 + }, + { + "epoch": 2.0, + "grad_norm": 0.20439157694766044, + "learning_rate": 1.7813864879491623e-05, + "loss": 0.3518, + "num_tokens": 1118084272.0, + "step": 1466 + }, + { + "epoch": 2.0013655809586806, + "grad_norm": 0.378984760095693, + "learning_rate": 1.7810616350535883e-05, + "loss": 0.3097, + "num_tokens": 1118868163.0, + "step": 1467 + }, + { + "epoch": 2.002731161917361, + "grad_norm": 0.2640621758362693, + "learning_rate": 1.780736574395604e-05, + "loss": 0.2975, + "num_tokens": 1119643637.0, + "step": 1468 + }, + { + "epoch": 2.0040967428760417, + "grad_norm": 0.22581722400585014, + "learning_rate": 1.780411306074371e-05, + "loss": 0.3011, + "num_tokens": 1120381599.0, + "step": 1469 + }, + { + "epoch": 2.0054623238347222, + "grad_norm": 0.23257444807994682, + "learning_rate": 1.7800858301891148e-05, + "loss": 0.3033, + "num_tokens": 1121151170.0, + "step": 1470 + }, + { + "epoch": 2.0068279047934023, + "grad_norm": 0.29324911922030766, + "learning_rate": 1.7797601468391236e-05, + "loss": 0.3032, + "num_tokens": 1121915108.0, + "step": 1471 + }, + { + "epoch": 2.008193485752083, + "grad_norm": 0.24582390505301388, + "learning_rate": 1.779434256123749e-05, + "loss": 0.2916, + "num_tokens": 1122662888.0, + "step": 1472 + }, + { + "epoch": 2.0095590667107635, + "grad_norm": 0.20767418342239186, + "learning_rate": 1.7791081581424074e-05, + "loss": 0.299, + "num_tokens": 1123450898.0, + "step": 1473 + }, + { + "epoch": 2.010924647669444, + "grad_norm": 0.24061084540292718, + "learning_rate": 1.778781852994576e-05, + "loss": 0.3151, + "num_tokens": 1124287336.0, + "step": 1474 + }, + { + "epoch": 2.0122902286281246, + "grad_norm": 0.19464234188575347, + "learning_rate": 1.778455340779797e-05, + "loss": 0.2924, + "num_tokens": 1125036585.0, + "step": 1475 + }, + { + "epoch": 2.013655809586805, + "grad_norm": 0.1970460782928974, + "learning_rate": 1.7781286215976746e-05, + "loss": 0.3152, + "num_tokens": 1125847140.0, + "step": 1476 + }, + { + "epoch": 2.0150213905454857, + "grad_norm": 0.19148437894811834, + "learning_rate": 1.7778016955478774e-05, + "loss": 0.3083, + "num_tokens": 1126599912.0, + "step": 1477 + }, + { + "epoch": 2.0163869715041662, + "grad_norm": 0.23189445798120906, + "learning_rate": 1.777474562730136e-05, + "loss": 0.3037, + "num_tokens": 1127431146.0, + "step": 1478 + }, + { + "epoch": 2.017752552462847, + "grad_norm": 0.20899589593357878, + "learning_rate": 1.777147223244244e-05, + "loss": 0.2978, + "num_tokens": 1128139535.0, + "step": 1479 + }, + { + "epoch": 2.019118133421527, + "grad_norm": 0.20722180758078915, + "learning_rate": 1.7768196771900596e-05, + "loss": 0.2998, + "num_tokens": 1128817824.0, + "step": 1480 + }, + { + "epoch": 2.0204837143802075, + "grad_norm": 0.21459711140754276, + "learning_rate": 1.7764919246675017e-05, + "loss": 0.2936, + "num_tokens": 1129519989.0, + "step": 1481 + }, + { + "epoch": 2.021849295338888, + "grad_norm": 0.20640922068068807, + "learning_rate": 1.7761639657765542e-05, + "loss": 0.3071, + "num_tokens": 1130352398.0, + "step": 1482 + }, + { + "epoch": 2.0232148762975686, + "grad_norm": 0.2050668006547895, + "learning_rate": 1.7758358006172628e-05, + "loss": 0.313, + "num_tokens": 1131122937.0, + "step": 1483 + }, + { + "epoch": 2.024580457256249, + "grad_norm": 0.21239057980054127, + "learning_rate": 1.7755074292897367e-05, + "loss": 0.2939, + "num_tokens": 1131797648.0, + "step": 1484 + }, + { + "epoch": 2.0259460382149297, + "grad_norm": 0.2094454594661445, + "learning_rate": 1.7751788518941476e-05, + "loss": 0.2955, + "num_tokens": 1132611499.0, + "step": 1485 + }, + { + "epoch": 2.0273116191736102, + "grad_norm": 0.1971088257589147, + "learning_rate": 1.7748500685307305e-05, + "loss": 0.3051, + "num_tokens": 1133394765.0, + "step": 1486 + }, + { + "epoch": 2.028677200132291, + "grad_norm": 0.22310991391117552, + "learning_rate": 1.7745210792997824e-05, + "loss": 0.3149, + "num_tokens": 1134180473.0, + "step": 1487 + }, + { + "epoch": 2.0300427810909714, + "grad_norm": 0.18757419047144772, + "learning_rate": 1.7741918843016642e-05, + "loss": 0.3167, + "num_tokens": 1134952514.0, + "step": 1488 + }, + { + "epoch": 2.0314083620496515, + "grad_norm": 0.20468953171221313, + "learning_rate": 1.7738624836367988e-05, + "loss": 0.317, + "num_tokens": 1135780919.0, + "step": 1489 + }, + { + "epoch": 2.032773943008332, + "grad_norm": 0.20079980582375928, + "learning_rate": 1.7735328774056725e-05, + "loss": 0.2962, + "num_tokens": 1136597576.0, + "step": 1490 + }, + { + "epoch": 2.0341395239670126, + "grad_norm": 0.198013350097171, + "learning_rate": 1.773203065708833e-05, + "loss": 0.3019, + "num_tokens": 1137372645.0, + "step": 1491 + }, + { + "epoch": 2.035505104925693, + "grad_norm": 0.19362761099290093, + "learning_rate": 1.772873048646893e-05, + "loss": 0.3008, + "num_tokens": 1138161263.0, + "step": 1492 + }, + { + "epoch": 2.0368706858843737, + "grad_norm": 0.23347834819424357, + "learning_rate": 1.7725428263205244e-05, + "loss": 0.3192, + "num_tokens": 1138946272.0, + "step": 1493 + }, + { + "epoch": 2.0382362668430543, + "grad_norm": 0.22842501503438037, + "learning_rate": 1.7722123988304654e-05, + "loss": 0.3006, + "num_tokens": 1139664557.0, + "step": 1494 + }, + { + "epoch": 2.039601847801735, + "grad_norm": 0.17409942401504142, + "learning_rate": 1.7718817662775147e-05, + "loss": 0.2996, + "num_tokens": 1140406072.0, + "step": 1495 + }, + { + "epoch": 2.0409674287604154, + "grad_norm": 0.188789890262592, + "learning_rate": 1.7715509287625333e-05, + "loss": 0.2941, + "num_tokens": 1141156031.0, + "step": 1496 + }, + { + "epoch": 2.042333009719096, + "grad_norm": 0.1823414480332087, + "learning_rate": 1.7712198863864465e-05, + "loss": 0.2946, + "num_tokens": 1141864061.0, + "step": 1497 + }, + { + "epoch": 2.043698590677776, + "grad_norm": 0.21232879446319053, + "learning_rate": 1.77088863925024e-05, + "loss": 0.3041, + "num_tokens": 1142600217.0, + "step": 1498 + }, + { + "epoch": 2.0450641716364566, + "grad_norm": 0.1882634178488866, + "learning_rate": 1.7705571874549635e-05, + "loss": 0.3032, + "num_tokens": 1143355489.0, + "step": 1499 + }, + { + "epoch": 2.046429752595137, + "grad_norm": 0.19855595816415186, + "learning_rate": 1.7702255311017286e-05, + "loss": 0.3018, + "num_tokens": 1144092999.0, + "step": 1500 + }, + { + "epoch": 2.0477953335538177, + "grad_norm": 0.20322239670781705, + "learning_rate": 1.7698936702917087e-05, + "loss": 0.309, + "num_tokens": 1144978396.0, + "step": 1501 + }, + { + "epoch": 2.0491609145124983, + "grad_norm": 0.17967185800937913, + "learning_rate": 1.769561605126141e-05, + "loss": 0.3099, + "num_tokens": 1145779524.0, + "step": 1502 + }, + { + "epoch": 2.050526495471179, + "grad_norm": 0.200849557148035, + "learning_rate": 1.7692293357063243e-05, + "loss": 0.3195, + "num_tokens": 1146581057.0, + "step": 1503 + }, + { + "epoch": 2.0518920764298594, + "grad_norm": 0.18978824196135377, + "learning_rate": 1.7688968621336186e-05, + "loss": 0.3007, + "num_tokens": 1147314748.0, + "step": 1504 + }, + { + "epoch": 2.05325765738854, + "grad_norm": 0.19017984236375973, + "learning_rate": 1.7685641845094476e-05, + "loss": 0.3024, + "num_tokens": 1148057925.0, + "step": 1505 + }, + { + "epoch": 2.0546232383472205, + "grad_norm": 0.20010666869913468, + "learning_rate": 1.7682313029352972e-05, + "loss": 0.293, + "num_tokens": 1148809488.0, + "step": 1506 + }, + { + "epoch": 2.0559888193059006, + "grad_norm": 0.17567069348537093, + "learning_rate": 1.767898217512715e-05, + "loss": 0.2951, + "num_tokens": 1149529398.0, + "step": 1507 + }, + { + "epoch": 2.057354400264581, + "grad_norm": 0.2183021900390405, + "learning_rate": 1.7675649283433106e-05, + "loss": 0.308, + "num_tokens": 1150279434.0, + "step": 1508 + }, + { + "epoch": 2.0587199812232617, + "grad_norm": 0.1957513529284227, + "learning_rate": 1.767231435528757e-05, + "loss": 0.3062, + "num_tokens": 1151052985.0, + "step": 1509 + }, + { + "epoch": 2.0600855621819423, + "grad_norm": 0.1909157754813959, + "learning_rate": 1.766897739170787e-05, + "loss": 0.3023, + "num_tokens": 1151792037.0, + "step": 1510 + }, + { + "epoch": 2.061451143140623, + "grad_norm": 0.2004641695412717, + "learning_rate": 1.7665638393711976e-05, + "loss": 0.3173, + "num_tokens": 1152561644.0, + "step": 1511 + }, + { + "epoch": 2.0628167240993034, + "grad_norm": 0.20399169084360003, + "learning_rate": 1.7662297362318476e-05, + "loss": 0.313, + "num_tokens": 1153487062.0, + "step": 1512 + }, + { + "epoch": 2.064182305057984, + "grad_norm": 0.19565431384966142, + "learning_rate": 1.7658954298546566e-05, + "loss": 0.2973, + "num_tokens": 1154270702.0, + "step": 1513 + }, + { + "epoch": 2.0655478860166645, + "grad_norm": 0.21373902807733913, + "learning_rate": 1.765560920341608e-05, + "loss": 0.2994, + "num_tokens": 1155034975.0, + "step": 1514 + }, + { + "epoch": 2.066913466975345, + "grad_norm": 0.1893228737894599, + "learning_rate": 1.7652262077947455e-05, + "loss": 0.2991, + "num_tokens": 1155797178.0, + "step": 1515 + }, + { + "epoch": 2.068279047934025, + "grad_norm": 0.19730966089781782, + "learning_rate": 1.764891292316175e-05, + "loss": 0.3086, + "num_tokens": 1156566344.0, + "step": 1516 + }, + { + "epoch": 2.0696446288927057, + "grad_norm": 0.19229642037229025, + "learning_rate": 1.7645561740080658e-05, + "loss": 0.3023, + "num_tokens": 1157328685.0, + "step": 1517 + }, + { + "epoch": 2.0710102098513863, + "grad_norm": 0.18879758039314692, + "learning_rate": 1.764220852972647e-05, + "loss": 0.3108, + "num_tokens": 1158098621.0, + "step": 1518 + }, + { + "epoch": 2.072375790810067, + "grad_norm": 0.21871952260531508, + "learning_rate": 1.763885329312211e-05, + "loss": 0.2953, + "num_tokens": 1158761210.0, + "step": 1519 + }, + { + "epoch": 2.0737413717687474, + "grad_norm": 0.19695721818712614, + "learning_rate": 1.7635496031291118e-05, + "loss": 0.3117, + "num_tokens": 1159550811.0, + "step": 1520 + }, + { + "epoch": 2.075106952727428, + "grad_norm": 0.21311383262612893, + "learning_rate": 1.763213674525764e-05, + "loss": 0.3077, + "num_tokens": 1160326563.0, + "step": 1521 + }, + { + "epoch": 2.0764725336861085, + "grad_norm": 0.20409204890170182, + "learning_rate": 1.762877543604646e-05, + "loss": 0.2967, + "num_tokens": 1161088340.0, + "step": 1522 + }, + { + "epoch": 2.077838114644789, + "grad_norm": 0.19262208810561374, + "learning_rate": 1.762541210468296e-05, + "loss": 0.3218, + "num_tokens": 1161847490.0, + "step": 1523 + }, + { + "epoch": 2.0792036956034696, + "grad_norm": 0.22016797439713584, + "learning_rate": 1.762204675219315e-05, + "loss": 0.2948, + "num_tokens": 1162546306.0, + "step": 1524 + }, + { + "epoch": 2.0805692765621497, + "grad_norm": 0.5837846262899884, + "learning_rate": 1.761867937960365e-05, + "loss": 0.3115, + "num_tokens": 1163336518.0, + "step": 1525 + }, + { + "epoch": 2.0819348575208303, + "grad_norm": 0.22224571839733556, + "learning_rate": 1.7615309987941705e-05, + "loss": 0.2952, + "num_tokens": 1164100514.0, + "step": 1526 + }, + { + "epoch": 2.083300438479511, + "grad_norm": 0.19730989018900974, + "learning_rate": 1.7611938578235167e-05, + "loss": 0.302, + "num_tokens": 1164823377.0, + "step": 1527 + }, + { + "epoch": 2.0846660194381914, + "grad_norm": 0.200725181027616, + "learning_rate": 1.7608565151512504e-05, + "loss": 0.2993, + "num_tokens": 1165628898.0, + "step": 1528 + }, + { + "epoch": 2.086031600396872, + "grad_norm": 0.2235255616948425, + "learning_rate": 1.760518970880281e-05, + "loss": 0.3126, + "num_tokens": 1166381108.0, + "step": 1529 + }, + { + "epoch": 2.0873971813555525, + "grad_norm": 0.20152611880984028, + "learning_rate": 1.7601812251135776e-05, + "loss": 0.2993, + "num_tokens": 1167159948.0, + "step": 1530 + }, + { + "epoch": 2.088762762314233, + "grad_norm": 0.20238639208630643, + "learning_rate": 1.7598432779541733e-05, + "loss": 0.3031, + "num_tokens": 1167897464.0, + "step": 1531 + }, + { + "epoch": 2.0901283432729136, + "grad_norm": 0.18577616050319412, + "learning_rate": 1.7595051295051594e-05, + "loss": 0.3009, + "num_tokens": 1168608133.0, + "step": 1532 + }, + { + "epoch": 2.091493924231594, + "grad_norm": 0.1989350837375857, + "learning_rate": 1.759166779869692e-05, + "loss": 0.3062, + "num_tokens": 1169426566.0, + "step": 1533 + }, + { + "epoch": 2.0928595051902743, + "grad_norm": 0.22239839062230013, + "learning_rate": 1.7588282291509858e-05, + "loss": 0.3008, + "num_tokens": 1170136846.0, + "step": 1534 + }, + { + "epoch": 2.094225086148955, + "grad_norm": 0.1958811077056453, + "learning_rate": 1.7584894774523185e-05, + "loss": 0.3132, + "num_tokens": 1170893379.0, + "step": 1535 + }, + { + "epoch": 2.0955906671076354, + "grad_norm": 0.21759500280643995, + "learning_rate": 1.7581505248770278e-05, + "loss": 0.3077, + "num_tokens": 1171636982.0, + "step": 1536 + }, + { + "epoch": 2.096956248066316, + "grad_norm": 0.224790581265609, + "learning_rate": 1.7578113715285147e-05, + "loss": 0.3259, + "num_tokens": 1172436991.0, + "step": 1537 + }, + { + "epoch": 2.0983218290249965, + "grad_norm": 0.20085084270047168, + "learning_rate": 1.7574720175102392e-05, + "loss": 0.2975, + "num_tokens": 1173197965.0, + "step": 1538 + }, + { + "epoch": 2.099687409983677, + "grad_norm": 0.20297960791562641, + "learning_rate": 1.757132462925724e-05, + "loss": 0.3093, + "num_tokens": 1174004921.0, + "step": 1539 + }, + { + "epoch": 2.1010529909423576, + "grad_norm": 0.2199653782129971, + "learning_rate": 1.756792707878552e-05, + "loss": 0.3095, + "num_tokens": 1174717386.0, + "step": 1540 + }, + { + "epoch": 2.102418571901038, + "grad_norm": 0.21154837187725112, + "learning_rate": 1.7564527524723685e-05, + "loss": 0.3028, + "num_tokens": 1175500615.0, + "step": 1541 + }, + { + "epoch": 2.1037841528597188, + "grad_norm": 0.19899016156145555, + "learning_rate": 1.7561125968108787e-05, + "loss": 0.299, + "num_tokens": 1176259304.0, + "step": 1542 + }, + { + "epoch": 2.105149733818399, + "grad_norm": 0.20454042291989116, + "learning_rate": 1.7557722409978495e-05, + "loss": 0.3069, + "num_tokens": 1177018979.0, + "step": 1543 + }, + { + "epoch": 2.1065153147770794, + "grad_norm": 0.19512931370978842, + "learning_rate": 1.7554316851371083e-05, + "loss": 0.3011, + "num_tokens": 1177743745.0, + "step": 1544 + }, + { + "epoch": 2.10788089573576, + "grad_norm": 0.2084311804297861, + "learning_rate": 1.755090929332545e-05, + "loss": 0.3011, + "num_tokens": 1178499475.0, + "step": 1545 + }, + { + "epoch": 2.1092464766944405, + "grad_norm": 0.1908326879518676, + "learning_rate": 1.754749973688108e-05, + "loss": 0.3126, + "num_tokens": 1179308225.0, + "step": 1546 + }, + { + "epoch": 2.110612057653121, + "grad_norm": 0.21271843018130182, + "learning_rate": 1.754408818307809e-05, + "loss": 0.2967, + "num_tokens": 1180078786.0, + "step": 1547 + }, + { + "epoch": 2.1119776386118017, + "grad_norm": 0.2101363786054184, + "learning_rate": 1.75406746329572e-05, + "loss": 0.3186, + "num_tokens": 1180893609.0, + "step": 1548 + }, + { + "epoch": 2.113343219570482, + "grad_norm": 0.18976509993678678, + "learning_rate": 1.7537259087559727e-05, + "loss": 0.3132, + "num_tokens": 1181647251.0, + "step": 1549 + }, + { + "epoch": 2.1147088005291628, + "grad_norm": 0.21991550872802687, + "learning_rate": 1.7533841547927616e-05, + "loss": 0.2997, + "num_tokens": 1182372341.0, + "step": 1550 + }, + { + "epoch": 2.1160743814878433, + "grad_norm": 0.18699414863598035, + "learning_rate": 1.75304220151034e-05, + "loss": 0.3053, + "num_tokens": 1183165313.0, + "step": 1551 + }, + { + "epoch": 2.1174399624465234, + "grad_norm": 0.19978010577754518, + "learning_rate": 1.7527000490130238e-05, + "loss": 0.3158, + "num_tokens": 1183867519.0, + "step": 1552 + }, + { + "epoch": 2.118805543405204, + "grad_norm": 0.21520606221239402, + "learning_rate": 1.7523576974051887e-05, + "loss": 0.3211, + "num_tokens": 1184646914.0, + "step": 1553 + }, + { + "epoch": 2.1201711243638846, + "grad_norm": 0.20855571815746415, + "learning_rate": 1.7520151467912714e-05, + "loss": 0.3081, + "num_tokens": 1185371483.0, + "step": 1554 + }, + { + "epoch": 2.121536705322565, + "grad_norm": 0.19249136333229466, + "learning_rate": 1.751672397275769e-05, + "loss": 0.3179, + "num_tokens": 1186092583.0, + "step": 1555 + }, + { + "epoch": 2.1229022862812457, + "grad_norm": 0.2066034868930569, + "learning_rate": 1.75132944896324e-05, + "loss": 0.3159, + "num_tokens": 1186919447.0, + "step": 1556 + }, + { + "epoch": 2.124267867239926, + "grad_norm": 0.19367397483400037, + "learning_rate": 1.7509863019583028e-05, + "loss": 0.3005, + "num_tokens": 1187619757.0, + "step": 1557 + }, + { + "epoch": 2.125633448198607, + "grad_norm": 0.2125593134694305, + "learning_rate": 1.7506429563656365e-05, + "loss": 0.2932, + "num_tokens": 1188367092.0, + "step": 1558 + }, + { + "epoch": 2.1269990291572873, + "grad_norm": 0.18227705087469964, + "learning_rate": 1.750299412289981e-05, + "loss": 0.2842, + "num_tokens": 1189046025.0, + "step": 1559 + }, + { + "epoch": 2.128364610115968, + "grad_norm": 0.19563572842176388, + "learning_rate": 1.749955669836137e-05, + "loss": 0.3019, + "num_tokens": 1189811291.0, + "step": 1560 + }, + { + "epoch": 2.129730191074648, + "grad_norm": 0.2042754187933906, + "learning_rate": 1.7496117291089655e-05, + "loss": 0.3266, + "num_tokens": 1190641692.0, + "step": 1561 + }, + { + "epoch": 2.1310957720333286, + "grad_norm": 0.18900395377253343, + "learning_rate": 1.7492675902133875e-05, + "loss": 0.2998, + "num_tokens": 1191389977.0, + "step": 1562 + }, + { + "epoch": 2.132461352992009, + "grad_norm": 0.18050829606039712, + "learning_rate": 1.7489232532543854e-05, + "loss": 0.3024, + "num_tokens": 1192157715.0, + "step": 1563 + }, + { + "epoch": 2.1338269339506897, + "grad_norm": 0.21298724928791526, + "learning_rate": 1.7485787183370008e-05, + "loss": 0.2911, + "num_tokens": 1192861461.0, + "step": 1564 + }, + { + "epoch": 2.1351925149093702, + "grad_norm": 0.18663028336315485, + "learning_rate": 1.7482339855663373e-05, + "loss": 0.3052, + "num_tokens": 1193585101.0, + "step": 1565 + }, + { + "epoch": 2.136558095868051, + "grad_norm": 0.18638920384754878, + "learning_rate": 1.747889055047557e-05, + "loss": 0.2981, + "num_tokens": 1194398715.0, + "step": 1566 + }, + { + "epoch": 2.1379236768267313, + "grad_norm": 0.18299159205711685, + "learning_rate": 1.747543926885884e-05, + "loss": 0.2997, + "num_tokens": 1195098929.0, + "step": 1567 + }, + { + "epoch": 2.139289257785412, + "grad_norm": 0.19668850772092272, + "learning_rate": 1.7471986011866017e-05, + "loss": 0.3049, + "num_tokens": 1195892171.0, + "step": 1568 + }, + { + "epoch": 2.1406548387440925, + "grad_norm": 0.19406109983595884, + "learning_rate": 1.7468530780550537e-05, + "loss": 0.2986, + "num_tokens": 1196738073.0, + "step": 1569 + }, + { + "epoch": 2.1420204197027726, + "grad_norm": 0.2013969570263123, + "learning_rate": 1.7465073575966445e-05, + "loss": 0.3001, + "num_tokens": 1197411356.0, + "step": 1570 + }, + { + "epoch": 2.143386000661453, + "grad_norm": 0.19124986157896903, + "learning_rate": 1.7461614399168383e-05, + "loss": 0.3143, + "num_tokens": 1198278619.0, + "step": 1571 + }, + { + "epoch": 2.1447515816201337, + "grad_norm": 0.1970164319044843, + "learning_rate": 1.7458153251211595e-05, + "loss": 0.3047, + "num_tokens": 1199040956.0, + "step": 1572 + }, + { + "epoch": 2.1461171625788142, + "grad_norm": 0.1926111711825931, + "learning_rate": 1.745469013315193e-05, + "loss": 0.2959, + "num_tokens": 1199756148.0, + "step": 1573 + }, + { + "epoch": 2.147482743537495, + "grad_norm": 0.20020157635562233, + "learning_rate": 1.7451225046045838e-05, + "loss": 0.3163, + "num_tokens": 1200538061.0, + "step": 1574 + }, + { + "epoch": 2.1488483244961754, + "grad_norm": 0.19638328177129943, + "learning_rate": 1.744775799095036e-05, + "loss": 0.3015, + "num_tokens": 1201316915.0, + "step": 1575 + }, + { + "epoch": 2.150213905454856, + "grad_norm": 0.18862844610352159, + "learning_rate": 1.744428896892315e-05, + "loss": 0.2992, + "num_tokens": 1202098701.0, + "step": 1576 + }, + { + "epoch": 2.1515794864135365, + "grad_norm": 0.20415456495265924, + "learning_rate": 1.7440817981022447e-05, + "loss": 0.3151, + "num_tokens": 1202893733.0, + "step": 1577 + }, + { + "epoch": 2.152945067372217, + "grad_norm": 0.1972198785915502, + "learning_rate": 1.7437345028307112e-05, + "loss": 0.3095, + "num_tokens": 1203657787.0, + "step": 1578 + }, + { + "epoch": 2.154310648330897, + "grad_norm": 0.21494962523549674, + "learning_rate": 1.743387011183659e-05, + "loss": 0.3123, + "num_tokens": 1204430801.0, + "step": 1579 + }, + { + "epoch": 2.1556762292895777, + "grad_norm": 0.19837030966970584, + "learning_rate": 1.7430393232670917e-05, + "loss": 0.3083, + "num_tokens": 1205279288.0, + "step": 1580 + }, + { + "epoch": 2.1570418102482583, + "grad_norm": 0.20019114996411247, + "learning_rate": 1.742691439187075e-05, + "loss": 0.2912, + "num_tokens": 1206006243.0, + "step": 1581 + }, + { + "epoch": 2.158407391206939, + "grad_norm": 0.20122898432263508, + "learning_rate": 1.7423433590497324e-05, + "loss": 0.2999, + "num_tokens": 1206698774.0, + "step": 1582 + }, + { + "epoch": 2.1597729721656194, + "grad_norm": 0.18935202120348568, + "learning_rate": 1.7419950829612487e-05, + "loss": 0.3137, + "num_tokens": 1207492312.0, + "step": 1583 + }, + { + "epoch": 2.1611385531243, + "grad_norm": 0.20394416353624736, + "learning_rate": 1.7416466110278677e-05, + "loss": 0.3107, + "num_tokens": 1208340905.0, + "step": 1584 + }, + { + "epoch": 2.1625041340829805, + "grad_norm": 0.20193750752191297, + "learning_rate": 1.741297943355893e-05, + "loss": 0.3004, + "num_tokens": 1209060695.0, + "step": 1585 + }, + { + "epoch": 2.163869715041661, + "grad_norm": 0.19997651054914212, + "learning_rate": 1.7409490800516882e-05, + "loss": 0.2969, + "num_tokens": 1209811137.0, + "step": 1586 + }, + { + "epoch": 2.1652352960003416, + "grad_norm": 0.19984011162803272, + "learning_rate": 1.740600021221676e-05, + "loss": 0.3131, + "num_tokens": 1210503848.0, + "step": 1587 + }, + { + "epoch": 2.1666008769590217, + "grad_norm": 0.1865080080772543, + "learning_rate": 1.7402507669723392e-05, + "loss": 0.3055, + "num_tokens": 1211333338.0, + "step": 1588 + }, + { + "epoch": 2.1679664579177023, + "grad_norm": 0.21849652876266337, + "learning_rate": 1.7399013174102208e-05, + "loss": 0.3079, + "num_tokens": 1212059086.0, + "step": 1589 + }, + { + "epoch": 2.169332038876383, + "grad_norm": 0.19010318637060003, + "learning_rate": 1.7395516726419217e-05, + "loss": 0.308, + "num_tokens": 1212804267.0, + "step": 1590 + }, + { + "epoch": 2.1706976198350634, + "grad_norm": 0.21516131046962358, + "learning_rate": 1.739201832774104e-05, + "loss": 0.3293, + "num_tokens": 1213615914.0, + "step": 1591 + }, + { + "epoch": 2.172063200793744, + "grad_norm": 0.17783115271461442, + "learning_rate": 1.738851797913489e-05, + "loss": 0.3098, + "num_tokens": 1214404658.0, + "step": 1592 + }, + { + "epoch": 2.1734287817524245, + "grad_norm": 0.1983888448186577, + "learning_rate": 1.7385015681668563e-05, + "loss": 0.3195, + "num_tokens": 1215201452.0, + "step": 1593 + }, + { + "epoch": 2.174794362711105, + "grad_norm": 0.20797598730458372, + "learning_rate": 1.7381511436410463e-05, + "loss": 0.3002, + "num_tokens": 1215897206.0, + "step": 1594 + }, + { + "epoch": 2.1761599436697856, + "grad_norm": 0.1919074603999815, + "learning_rate": 1.7378005244429587e-05, + "loss": 0.2969, + "num_tokens": 1216639890.0, + "step": 1595 + }, + { + "epoch": 2.177525524628466, + "grad_norm": 0.1942675409140386, + "learning_rate": 1.7374497106795516e-05, + "loss": 0.3147, + "num_tokens": 1217393107.0, + "step": 1596 + }, + { + "epoch": 2.1788911055871463, + "grad_norm": 0.2050117570320177, + "learning_rate": 1.7370987024578438e-05, + "loss": 0.3154, + "num_tokens": 1218117690.0, + "step": 1597 + }, + { + "epoch": 2.180256686545827, + "grad_norm": 0.18749249912544938, + "learning_rate": 1.736747499884912e-05, + "loss": 0.3241, + "num_tokens": 1218954238.0, + "step": 1598 + }, + { + "epoch": 2.1816222675045074, + "grad_norm": 0.18688078960573537, + "learning_rate": 1.736396103067893e-05, + "loss": 0.2895, + "num_tokens": 1219750936.0, + "step": 1599 + }, + { + "epoch": 2.182987848463188, + "grad_norm": 0.19025568548952487, + "learning_rate": 1.736044512113983e-05, + "loss": 0.3072, + "num_tokens": 1220509174.0, + "step": 1600 + }, + { + "epoch": 2.1843534294218685, + "grad_norm": 0.18586090830660376, + "learning_rate": 1.7356927271304372e-05, + "loss": 0.3045, + "num_tokens": 1221262257.0, + "step": 1601 + }, + { + "epoch": 2.185719010380549, + "grad_norm": 0.1896106330865507, + "learning_rate": 1.7353407482245698e-05, + "loss": 0.2897, + "num_tokens": 1221994116.0, + "step": 1602 + }, + { + "epoch": 2.1870845913392296, + "grad_norm": 0.1787010912940222, + "learning_rate": 1.7349885755037546e-05, + "loss": 0.2924, + "num_tokens": 1222697836.0, + "step": 1603 + }, + { + "epoch": 2.18845017229791, + "grad_norm": 0.2033940100402742, + "learning_rate": 1.7346362090754237e-05, + "loss": 0.2945, + "num_tokens": 1223480523.0, + "step": 1604 + }, + { + "epoch": 2.1898157532565907, + "grad_norm": 0.20025247514148492, + "learning_rate": 1.7342836490470692e-05, + "loss": 0.3027, + "num_tokens": 1224216506.0, + "step": 1605 + }, + { + "epoch": 2.191181334215271, + "grad_norm": 0.20670234173746446, + "learning_rate": 1.733930895526242e-05, + "loss": 0.2986, + "num_tokens": 1224956603.0, + "step": 1606 + }, + { + "epoch": 2.1925469151739514, + "grad_norm": 0.1822251451955554, + "learning_rate": 1.733577948620552e-05, + "loss": 0.3115, + "num_tokens": 1225770045.0, + "step": 1607 + }, + { + "epoch": 2.193912496132632, + "grad_norm": 0.18229291883434315, + "learning_rate": 1.7332248084376677e-05, + "loss": 0.3036, + "num_tokens": 1226504346.0, + "step": 1608 + }, + { + "epoch": 2.1952780770913125, + "grad_norm": 0.18697885462128633, + "learning_rate": 1.7328714750853166e-05, + "loss": 0.3116, + "num_tokens": 1227269905.0, + "step": 1609 + }, + { + "epoch": 2.196643658049993, + "grad_norm": 0.19786720943895045, + "learning_rate": 1.732517948671286e-05, + "loss": 0.2941, + "num_tokens": 1228014620.0, + "step": 1610 + }, + { + "epoch": 2.1980092390086736, + "grad_norm": 0.1906529325901666, + "learning_rate": 1.7321642293034215e-05, + "loss": 0.3183, + "num_tokens": 1228827411.0, + "step": 1611 + }, + { + "epoch": 2.199374819967354, + "grad_norm": 0.19223377948122866, + "learning_rate": 1.731810317089627e-05, + "loss": 0.2996, + "num_tokens": 1229550333.0, + "step": 1612 + }, + { + "epoch": 2.2007404009260347, + "grad_norm": 0.19067789112880787, + "learning_rate": 1.7314562121378664e-05, + "loss": 0.3065, + "num_tokens": 1230246583.0, + "step": 1613 + }, + { + "epoch": 2.2021059818847153, + "grad_norm": 0.19948609741612505, + "learning_rate": 1.7311019145561615e-05, + "loss": 0.306, + "num_tokens": 1231011477.0, + "step": 1614 + }, + { + "epoch": 2.2034715628433954, + "grad_norm": 0.2154389142213592, + "learning_rate": 1.730747424452593e-05, + "loss": 0.3183, + "num_tokens": 1231789421.0, + "step": 1615 + }, + { + "epoch": 2.204837143802076, + "grad_norm": 0.2144109103560242, + "learning_rate": 1.7303927419353008e-05, + "loss": 0.3096, + "num_tokens": 1232553357.0, + "step": 1616 + }, + { + "epoch": 2.2062027247607565, + "grad_norm": 0.18616152611126027, + "learning_rate": 1.730037867112483e-05, + "loss": 0.3048, + "num_tokens": 1233326954.0, + "step": 1617 + }, + { + "epoch": 2.207568305719437, + "grad_norm": 0.21542358680608134, + "learning_rate": 1.7296828000923967e-05, + "loss": 0.3088, + "num_tokens": 1234112353.0, + "step": 1618 + }, + { + "epoch": 2.2089338866781176, + "grad_norm": 0.18655179628888083, + "learning_rate": 1.7293275409833572e-05, + "loss": 0.3135, + "num_tokens": 1234852535.0, + "step": 1619 + }, + { + "epoch": 2.210299467636798, + "grad_norm": 0.2126899973965063, + "learning_rate": 1.7289720898937384e-05, + "loss": 0.3003, + "num_tokens": 1235656199.0, + "step": 1620 + }, + { + "epoch": 2.2116650485954787, + "grad_norm": 0.19741803874379485, + "learning_rate": 1.728616446931974e-05, + "loss": 0.2989, + "num_tokens": 1236356066.0, + "step": 1621 + }, + { + "epoch": 2.2130306295541593, + "grad_norm": 0.19424926932504472, + "learning_rate": 1.7282606122065545e-05, + "loss": 0.3196, + "num_tokens": 1237138317.0, + "step": 1622 + }, + { + "epoch": 2.21439621051284, + "grad_norm": 0.20728473533469957, + "learning_rate": 1.72790458582603e-05, + "loss": 0.3178, + "num_tokens": 1237961751.0, + "step": 1623 + }, + { + "epoch": 2.21576179147152, + "grad_norm": 0.20385084512263607, + "learning_rate": 1.7275483678990085e-05, + "loss": 0.3078, + "num_tokens": 1238689402.0, + "step": 1624 + }, + { + "epoch": 2.2171273724302005, + "grad_norm": 0.21621085657162664, + "learning_rate": 1.7271919585341568e-05, + "loss": 0.2948, + "num_tokens": 1239455746.0, + "step": 1625 + }, + { + "epoch": 2.218492953388881, + "grad_norm": 0.2020174550806951, + "learning_rate": 1.7268353578401998e-05, + "loss": 0.3059, + "num_tokens": 1240189374.0, + "step": 1626 + }, + { + "epoch": 2.2198585343475616, + "grad_norm": 0.2068754833803503, + "learning_rate": 1.726478565925921e-05, + "loss": 0.3138, + "num_tokens": 1240951640.0, + "step": 1627 + }, + { + "epoch": 2.221224115306242, + "grad_norm": 0.19063116738003477, + "learning_rate": 1.7261215829001626e-05, + "loss": 0.3007, + "num_tokens": 1241720430.0, + "step": 1628 + }, + { + "epoch": 2.2225896962649228, + "grad_norm": 0.2198151864235673, + "learning_rate": 1.7257644088718237e-05, + "loss": 0.313, + "num_tokens": 1242548477.0, + "step": 1629 + }, + { + "epoch": 2.2239552772236033, + "grad_norm": 0.1944092581464973, + "learning_rate": 1.725407043949864e-05, + "loss": 0.3023, + "num_tokens": 1243335787.0, + "step": 1630 + }, + { + "epoch": 2.225320858182284, + "grad_norm": 0.19795640152887986, + "learning_rate": 1.7250494882432986e-05, + "loss": 0.3163, + "num_tokens": 1244055332.0, + "step": 1631 + }, + { + "epoch": 2.2266864391409644, + "grad_norm": 0.1952205144872384, + "learning_rate": 1.7246917418612033e-05, + "loss": 0.3204, + "num_tokens": 1244831964.0, + "step": 1632 + }, + { + "epoch": 2.2280520200996445, + "grad_norm": 0.20040040043327595, + "learning_rate": 1.7243338049127103e-05, + "loss": 0.3131, + "num_tokens": 1245681481.0, + "step": 1633 + }, + { + "epoch": 2.229417601058325, + "grad_norm": 0.20349222040699316, + "learning_rate": 1.7239756775070112e-05, + "loss": 0.3098, + "num_tokens": 1246501306.0, + "step": 1634 + }, + { + "epoch": 2.2307831820170057, + "grad_norm": 0.18083847955470475, + "learning_rate": 1.723617359753355e-05, + "loss": 0.3001, + "num_tokens": 1247262390.0, + "step": 1635 + }, + { + "epoch": 2.232148762975686, + "grad_norm": 0.196535102643425, + "learning_rate": 1.723258851761049e-05, + "loss": 0.3035, + "num_tokens": 1247979818.0, + "step": 1636 + }, + { + "epoch": 2.2335143439343668, + "grad_norm": 0.22422620668025509, + "learning_rate": 1.722900153639458e-05, + "loss": 0.3104, + "num_tokens": 1248698103.0, + "step": 1637 + }, + { + "epoch": 2.2348799248930473, + "grad_norm": 0.19365652452317503, + "learning_rate": 1.722541265498006e-05, + "loss": 0.2936, + "num_tokens": 1249361491.0, + "step": 1638 + }, + { + "epoch": 2.236245505851728, + "grad_norm": 0.20718937873001325, + "learning_rate": 1.7221821874461737e-05, + "loss": 0.309, + "num_tokens": 1250131505.0, + "step": 1639 + }, + { + "epoch": 2.2376110868104084, + "grad_norm": 0.21751870221977251, + "learning_rate": 1.7218229195935004e-05, + "loss": 0.3091, + "num_tokens": 1250833679.0, + "step": 1640 + }, + { + "epoch": 2.238976667769089, + "grad_norm": 0.20252562127977577, + "learning_rate": 1.7214634620495835e-05, + "loss": 0.3088, + "num_tokens": 1251607518.0, + "step": 1641 + }, + { + "epoch": 2.240342248727769, + "grad_norm": 0.19491066411379782, + "learning_rate": 1.7211038149240774e-05, + "loss": 0.3091, + "num_tokens": 1252444863.0, + "step": 1642 + }, + { + "epoch": 2.2417078296864497, + "grad_norm": 0.1849233160627862, + "learning_rate": 1.7207439783266952e-05, + "loss": 0.3065, + "num_tokens": 1253205956.0, + "step": 1643 + }, + { + "epoch": 2.24307341064513, + "grad_norm": 0.21182690241234958, + "learning_rate": 1.720383952367207e-05, + "loss": 0.3217, + "num_tokens": 1253956104.0, + "step": 1644 + }, + { + "epoch": 2.2444389916038108, + "grad_norm": 0.18727600746545803, + "learning_rate": 1.720023737155442e-05, + "loss": 0.3064, + "num_tokens": 1254688382.0, + "step": 1645 + }, + { + "epoch": 2.2458045725624913, + "grad_norm": 0.21150420571988576, + "learning_rate": 1.7196633328012856e-05, + "loss": 0.2931, + "num_tokens": 1255526705.0, + "step": 1646 + }, + { + "epoch": 2.247170153521172, + "grad_norm": 0.18794664641355024, + "learning_rate": 1.7193027394146812e-05, + "loss": 0.3156, + "num_tokens": 1256340114.0, + "step": 1647 + }, + { + "epoch": 2.2485357344798524, + "grad_norm": 0.1816237566229095, + "learning_rate": 1.7189419571056315e-05, + "loss": 0.3119, + "num_tokens": 1257082983.0, + "step": 1648 + }, + { + "epoch": 2.249901315438533, + "grad_norm": 0.2066866483463397, + "learning_rate": 1.718580985984194e-05, + "loss": 0.311, + "num_tokens": 1257865306.0, + "step": 1649 + }, + { + "epoch": 2.2512668963972136, + "grad_norm": 0.19960621184870297, + "learning_rate": 1.7182198261604866e-05, + "loss": 0.3176, + "num_tokens": 1258645258.0, + "step": 1650 + }, + { + "epoch": 2.2526324773558937, + "grad_norm": 0.18158738690363585, + "learning_rate": 1.7178584777446834e-05, + "loss": 0.3004, + "num_tokens": 1259425592.0, + "step": 1651 + }, + { + "epoch": 2.2539980583145742, + "grad_norm": 0.19713885713576884, + "learning_rate": 1.717496940847015e-05, + "loss": 0.3002, + "num_tokens": 1260204885.0, + "step": 1652 + }, + { + "epoch": 2.255363639273255, + "grad_norm": 0.1947673036544335, + "learning_rate": 1.717135215577772e-05, + "loss": 0.3062, + "num_tokens": 1260918033.0, + "step": 1653 + }, + { + "epoch": 2.2567292202319353, + "grad_norm": 0.21048709810491506, + "learning_rate": 1.7167733020473002e-05, + "loss": 0.3134, + "num_tokens": 1261661298.0, + "step": 1654 + }, + { + "epoch": 2.258094801190616, + "grad_norm": 0.20406374159999827, + "learning_rate": 1.7164112003660046e-05, + "loss": 0.3162, + "num_tokens": 1262460817.0, + "step": 1655 + }, + { + "epoch": 2.2594603821492965, + "grad_norm": 0.19574851695131437, + "learning_rate": 1.716048910644346e-05, + "loss": 0.3195, + "num_tokens": 1263196208.0, + "step": 1656 + }, + { + "epoch": 2.260825963107977, + "grad_norm": 0.20838718341554965, + "learning_rate": 1.7156864329928437e-05, + "loss": 0.293, + "num_tokens": 1263958686.0, + "step": 1657 + }, + { + "epoch": 2.2621915440666576, + "grad_norm": 0.18974749214960016, + "learning_rate": 1.715323767522074e-05, + "loss": 0.3031, + "num_tokens": 1264716705.0, + "step": 1658 + }, + { + "epoch": 2.263557125025338, + "grad_norm": 0.19636076715119866, + "learning_rate": 1.7149609143426697e-05, + "loss": 0.3065, + "num_tokens": 1265448868.0, + "step": 1659 + }, + { + "epoch": 2.2649227059840182, + "grad_norm": 0.2201382594976657, + "learning_rate": 1.7145978735653225e-05, + "loss": 0.3029, + "num_tokens": 1266217065.0, + "step": 1660 + }, + { + "epoch": 2.266288286942699, + "grad_norm": 0.20426313353815884, + "learning_rate": 1.7142346453007795e-05, + "loss": 0.3021, + "num_tokens": 1267028442.0, + "step": 1661 + }, + { + "epoch": 2.2676538679013793, + "grad_norm": 0.20462336590489053, + "learning_rate": 1.713871229659847e-05, + "loss": 0.3008, + "num_tokens": 1267765424.0, + "step": 1662 + }, + { + "epoch": 2.26901944886006, + "grad_norm": 0.2117771807775812, + "learning_rate": 1.713507626753387e-05, + "loss": 0.2936, + "num_tokens": 1268514972.0, + "step": 1663 + }, + { + "epoch": 2.2703850298187405, + "grad_norm": 0.19396276361020615, + "learning_rate": 1.7131438366923184e-05, + "loss": 0.2952, + "num_tokens": 1269174574.0, + "step": 1664 + }, + { + "epoch": 2.271750610777421, + "grad_norm": 0.21589202532287316, + "learning_rate": 1.7127798595876183e-05, + "loss": 0.3061, + "num_tokens": 1269979292.0, + "step": 1665 + }, + { + "epoch": 2.2731161917361016, + "grad_norm": 0.19061958731902487, + "learning_rate": 1.7124156955503208e-05, + "loss": 0.3057, + "num_tokens": 1270739608.0, + "step": 1666 + }, + { + "epoch": 2.274481772694782, + "grad_norm": 0.20423152615671686, + "learning_rate": 1.7120513446915154e-05, + "loss": 0.2958, + "num_tokens": 1271543309.0, + "step": 1667 + }, + { + "epoch": 2.2758473536534627, + "grad_norm": 0.19432124546171314, + "learning_rate": 1.711686807122351e-05, + "loss": 0.2992, + "num_tokens": 1272295795.0, + "step": 1668 + }, + { + "epoch": 2.277212934612143, + "grad_norm": 0.1879319707203541, + "learning_rate": 1.7113220829540313e-05, + "loss": 0.3125, + "num_tokens": 1273172358.0, + "step": 1669 + }, + { + "epoch": 2.2785785155708234, + "grad_norm": 0.19337251202305897, + "learning_rate": 1.710957172297818e-05, + "loss": 0.3083, + "num_tokens": 1273891520.0, + "step": 1670 + }, + { + "epoch": 2.279944096529504, + "grad_norm": 0.18215176873970268, + "learning_rate": 1.71059207526503e-05, + "loss": 0.295, + "num_tokens": 1274649585.0, + "step": 1671 + }, + { + "epoch": 2.2813096774881845, + "grad_norm": 0.20089091204377357, + "learning_rate": 1.710226791967042e-05, + "loss": 0.3073, + "num_tokens": 1275330787.0, + "step": 1672 + }, + { + "epoch": 2.282675258446865, + "grad_norm": 0.18483141087785823, + "learning_rate": 1.709861322515287e-05, + "loss": 0.3197, + "num_tokens": 1276185055.0, + "step": 1673 + }, + { + "epoch": 2.2840408394055456, + "grad_norm": 0.20418536068109885, + "learning_rate": 1.7094956670212527e-05, + "loss": 0.3136, + "num_tokens": 1276967152.0, + "step": 1674 + }, + { + "epoch": 2.285406420364226, + "grad_norm": 0.18547875604168182, + "learning_rate": 1.709129825596486e-05, + "loss": 0.3085, + "num_tokens": 1277682487.0, + "step": 1675 + }, + { + "epoch": 2.2867720013229067, + "grad_norm": 0.21142835394399176, + "learning_rate": 1.7087637983525883e-05, + "loss": 0.2947, + "num_tokens": 1278525929.0, + "step": 1676 + }, + { + "epoch": 2.2881375822815873, + "grad_norm": 0.1770282166532555, + "learning_rate": 1.7083975854012193e-05, + "loss": 0.3118, + "num_tokens": 1279252968.0, + "step": 1677 + }, + { + "epoch": 2.2895031632402674, + "grad_norm": 0.19414659637115766, + "learning_rate": 1.7080311868540943e-05, + "loss": 0.3057, + "num_tokens": 1280017395.0, + "step": 1678 + }, + { + "epoch": 2.290868744198948, + "grad_norm": 0.18580560648037378, + "learning_rate": 1.7076646028229857e-05, + "loss": 0.3071, + "num_tokens": 1280757734.0, + "step": 1679 + }, + { + "epoch": 2.2922343251576285, + "grad_norm": 0.1966637819659975, + "learning_rate": 1.707297833419723e-05, + "loss": 0.3106, + "num_tokens": 1281551240.0, + "step": 1680 + }, + { + "epoch": 2.293599906116309, + "grad_norm": 0.20607549541357992, + "learning_rate": 1.7069308787561905e-05, + "loss": 0.3025, + "num_tokens": 1282370339.0, + "step": 1681 + }, + { + "epoch": 2.2949654870749896, + "grad_norm": 0.1713230446632435, + "learning_rate": 1.7065637389443312e-05, + "loss": 0.3155, + "num_tokens": 1283142666.0, + "step": 1682 + }, + { + "epoch": 2.29633106803367, + "grad_norm": 0.19393003440203835, + "learning_rate": 1.7061964140961433e-05, + "loss": 0.3016, + "num_tokens": 1283883040.0, + "step": 1683 + }, + { + "epoch": 2.2976966489923507, + "grad_norm": 0.20712994876831672, + "learning_rate": 1.7058289043236813e-05, + "loss": 0.3055, + "num_tokens": 1284677952.0, + "step": 1684 + }, + { + "epoch": 2.2990622299510313, + "grad_norm": 0.16819672682883605, + "learning_rate": 1.7054612097390568e-05, + "loss": 0.2974, + "num_tokens": 1285506489.0, + "step": 1685 + }, + { + "epoch": 2.300427810909712, + "grad_norm": 0.20247322645888335, + "learning_rate": 1.7050933304544377e-05, + "loss": 0.3081, + "num_tokens": 1286221804.0, + "step": 1686 + }, + { + "epoch": 2.301793391868392, + "grad_norm": 0.19824404538190513, + "learning_rate": 1.7047252665820478e-05, + "loss": 0.3032, + "num_tokens": 1286988171.0, + "step": 1687 + }, + { + "epoch": 2.3031589728270725, + "grad_norm": 0.20330454770350592, + "learning_rate": 1.7043570182341672e-05, + "loss": 0.3045, + "num_tokens": 1287741158.0, + "step": 1688 + }, + { + "epoch": 2.304524553785753, + "grad_norm": 0.20054945726315188, + "learning_rate": 1.7039885855231333e-05, + "loss": 0.3154, + "num_tokens": 1288566326.0, + "step": 1689 + }, + { + "epoch": 2.3058901347444336, + "grad_norm": 0.19127437395324873, + "learning_rate": 1.7036199685613382e-05, + "loss": 0.3151, + "num_tokens": 1289324608.0, + "step": 1690 + }, + { + "epoch": 2.307255715703114, + "grad_norm": 0.20730860967030984, + "learning_rate": 1.7032511674612313e-05, + "loss": 0.3129, + "num_tokens": 1290115930.0, + "step": 1691 + }, + { + "epoch": 2.3086212966617947, + "grad_norm": 0.198772882149762, + "learning_rate": 1.7028821823353175e-05, + "loss": 0.3026, + "num_tokens": 1290858071.0, + "step": 1692 + }, + { + "epoch": 2.3099868776204753, + "grad_norm": 0.21079257433506593, + "learning_rate": 1.702513013296159e-05, + "loss": 0.3015, + "num_tokens": 1291671597.0, + "step": 1693 + }, + { + "epoch": 2.311352458579156, + "grad_norm": 0.18442749390224808, + "learning_rate": 1.7021436604563723e-05, + "loss": 0.2976, + "num_tokens": 1292397134.0, + "step": 1694 + }, + { + "epoch": 2.3127180395378364, + "grad_norm": 0.19816470087034735, + "learning_rate": 1.701774123928632e-05, + "loss": 0.3099, + "num_tokens": 1293140104.0, + "step": 1695 + }, + { + "epoch": 2.3140836204965165, + "grad_norm": 0.20234045606228304, + "learning_rate": 1.7014044038256664e-05, + "loss": 0.3095, + "num_tokens": 1293863467.0, + "step": 1696 + }, + { + "epoch": 2.315449201455197, + "grad_norm": 0.19484575719690514, + "learning_rate": 1.7010345002602622e-05, + "loss": 0.2923, + "num_tokens": 1294677183.0, + "step": 1697 + }, + { + "epoch": 2.3168147824138776, + "grad_norm": 0.193949845702339, + "learning_rate": 1.7006644133452607e-05, + "loss": 0.3011, + "num_tokens": 1295434715.0, + "step": 1698 + }, + { + "epoch": 2.318180363372558, + "grad_norm": 0.19449065850452904, + "learning_rate": 1.700294143193559e-05, + "loss": 0.2929, + "num_tokens": 1296182884.0, + "step": 1699 + }, + { + "epoch": 2.3195459443312387, + "grad_norm": 0.19649819909504884, + "learning_rate": 1.699923689918111e-05, + "loss": 0.3062, + "num_tokens": 1296858812.0, + "step": 1700 + }, + { + "epoch": 2.3209115252899193, + "grad_norm": 0.21319153109177408, + "learning_rate": 1.6995530536319255e-05, + "loss": 0.3084, + "num_tokens": 1297653607.0, + "step": 1701 + }, + { + "epoch": 2.3222771062486, + "grad_norm": 0.19579320681527534, + "learning_rate": 1.6991822344480675e-05, + "loss": 0.31, + "num_tokens": 1298372030.0, + "step": 1702 + }, + { + "epoch": 2.3236426872072804, + "grad_norm": 0.19163136938177616, + "learning_rate": 1.6988112324796586e-05, + "loss": 0.316, + "num_tokens": 1299189694.0, + "step": 1703 + }, + { + "epoch": 2.325008268165961, + "grad_norm": 0.21259968765607098, + "learning_rate": 1.6984400478398746e-05, + "loss": 0.3149, + "num_tokens": 1299875638.0, + "step": 1704 + }, + { + "epoch": 2.326373849124641, + "grad_norm": 0.20177014424860873, + "learning_rate": 1.698068680641949e-05, + "loss": 0.3059, + "num_tokens": 1300720094.0, + "step": 1705 + }, + { + "epoch": 2.3277394300833216, + "grad_norm": 0.19593054888954822, + "learning_rate": 1.697697130999168e-05, + "loss": 0.3231, + "num_tokens": 1301507197.0, + "step": 1706 + }, + { + "epoch": 2.329105011042002, + "grad_norm": 0.21356463601493972, + "learning_rate": 1.6973253990248774e-05, + "loss": 0.3056, + "num_tokens": 1302222817.0, + "step": 1707 + }, + { + "epoch": 2.3304705920006827, + "grad_norm": 0.195159164538878, + "learning_rate": 1.6969534848324747e-05, + "loss": 0.3037, + "num_tokens": 1302923451.0, + "step": 1708 + }, + { + "epoch": 2.3318361729593633, + "grad_norm": 0.206143828763009, + "learning_rate": 1.6965813885354163e-05, + "loss": 0.3041, + "num_tokens": 1303644790.0, + "step": 1709 + }, + { + "epoch": 2.333201753918044, + "grad_norm": 0.20273519820856614, + "learning_rate": 1.6962091102472115e-05, + "loss": 0.316, + "num_tokens": 1304429905.0, + "step": 1710 + }, + { + "epoch": 2.3345673348767244, + "grad_norm": 0.1880606890411883, + "learning_rate": 1.6958366500814268e-05, + "loss": 0.3021, + "num_tokens": 1305203401.0, + "step": 1711 + }, + { + "epoch": 2.335932915835405, + "grad_norm": 0.20266948444705415, + "learning_rate": 1.695464008151684e-05, + "loss": 0.296, + "num_tokens": 1305941214.0, + "step": 1712 + }, + { + "epoch": 2.3372984967940855, + "grad_norm": 0.18955383428814435, + "learning_rate": 1.6950911845716594e-05, + "loss": 0.2993, + "num_tokens": 1306690777.0, + "step": 1713 + }, + { + "epoch": 2.3386640777527656, + "grad_norm": 0.19933509720989648, + "learning_rate": 1.6947181794550858e-05, + "loss": 0.317, + "num_tokens": 1307446705.0, + "step": 1714 + }, + { + "epoch": 2.340029658711446, + "grad_norm": 0.19156201042856566, + "learning_rate": 1.6943449929157506e-05, + "loss": 0.3023, + "num_tokens": 1308219842.0, + "step": 1715 + }, + { + "epoch": 2.3413952396701267, + "grad_norm": 0.20520868457329722, + "learning_rate": 1.6939716250674973e-05, + "loss": 0.3117, + "num_tokens": 1308912931.0, + "step": 1716 + }, + { + "epoch": 2.3427608206288073, + "grad_norm": 0.1992790805644641, + "learning_rate": 1.6935980760242235e-05, + "loss": 0.3133, + "num_tokens": 1309677959.0, + "step": 1717 + }, + { + "epoch": 2.344126401587488, + "grad_norm": 0.19540549884978065, + "learning_rate": 1.693224345899883e-05, + "loss": 0.3048, + "num_tokens": 1310405001.0, + "step": 1718 + }, + { + "epoch": 2.3454919825461684, + "grad_norm": 0.21243786664041073, + "learning_rate": 1.6928504348084852e-05, + "loss": 0.302, + "num_tokens": 1311159861.0, + "step": 1719 + }, + { + "epoch": 2.346857563504849, + "grad_norm": 0.20219654612213925, + "learning_rate": 1.692476342864094e-05, + "loss": 0.2964, + "num_tokens": 1311827091.0, + "step": 1720 + }, + { + "epoch": 2.3482231444635295, + "grad_norm": 0.19156045407198316, + "learning_rate": 1.6921020701808285e-05, + "loss": 0.3126, + "num_tokens": 1312588648.0, + "step": 1721 + }, + { + "epoch": 2.34958872542221, + "grad_norm": 0.21103019430205142, + "learning_rate": 1.6917276168728634e-05, + "loss": 0.3001, + "num_tokens": 1313300228.0, + "step": 1722 + }, + { + "epoch": 2.35095430638089, + "grad_norm": 0.19983071401172664, + "learning_rate": 1.6913529830544275e-05, + "loss": 0.2935, + "num_tokens": 1313981114.0, + "step": 1723 + }, + { + "epoch": 2.3523198873395708, + "grad_norm": 0.254239789527125, + "learning_rate": 1.6909781688398055e-05, + "loss": 0.3094, + "num_tokens": 1314805436.0, + "step": 1724 + }, + { + "epoch": 2.3536854682982513, + "grad_norm": 0.1781315176919803, + "learning_rate": 1.6906031743433383e-05, + "loss": 0.2986, + "num_tokens": 1315615689.0, + "step": 1725 + }, + { + "epoch": 2.355051049256932, + "grad_norm": 0.19337540572717118, + "learning_rate": 1.6902279996794188e-05, + "loss": 0.3061, + "num_tokens": 1316340770.0, + "step": 1726 + }, + { + "epoch": 2.3564166302156124, + "grad_norm": 0.20267254402251278, + "learning_rate": 1.6898526449624972e-05, + "loss": 0.3055, + "num_tokens": 1317031406.0, + "step": 1727 + }, + { + "epoch": 2.357782211174293, + "grad_norm": 0.17950527292320267, + "learning_rate": 1.689477110307078e-05, + "loss": 0.2986, + "num_tokens": 1317733282.0, + "step": 1728 + }, + { + "epoch": 2.3591477921329735, + "grad_norm": 0.19269285476691655, + "learning_rate": 1.6891013958277207e-05, + "loss": 0.2971, + "num_tokens": 1318545253.0, + "step": 1729 + }, + { + "epoch": 2.360513373091654, + "grad_norm": 0.19679638846635478, + "learning_rate": 1.6887255016390396e-05, + "loss": 0.3083, + "num_tokens": 1319269525.0, + "step": 1730 + }, + { + "epoch": 2.3618789540503347, + "grad_norm": 0.17335539450662207, + "learning_rate": 1.688349427855703e-05, + "loss": 0.2954, + "num_tokens": 1320073829.0, + "step": 1731 + }, + { + "epoch": 2.3632445350090148, + "grad_norm": 0.19406575782307683, + "learning_rate": 1.6879731745924354e-05, + "loss": 0.2931, + "num_tokens": 1320819598.0, + "step": 1732 + }, + { + "epoch": 2.3646101159676953, + "grad_norm": 0.18189585689578874, + "learning_rate": 1.6875967419640148e-05, + "loss": 0.3119, + "num_tokens": 1321513625.0, + "step": 1733 + }, + { + "epoch": 2.365975696926376, + "grad_norm": 0.18590180422552516, + "learning_rate": 1.6872201300852754e-05, + "loss": 0.3172, + "num_tokens": 1322301532.0, + "step": 1734 + }, + { + "epoch": 2.3673412778850564, + "grad_norm": 0.18158927744954478, + "learning_rate": 1.6868433390711046e-05, + "loss": 0.3144, + "num_tokens": 1323133432.0, + "step": 1735 + }, + { + "epoch": 2.368706858843737, + "grad_norm": 0.2110643341548387, + "learning_rate": 1.6864663690364453e-05, + "loss": 0.2924, + "num_tokens": 1323861399.0, + "step": 1736 + }, + { + "epoch": 2.3700724398024176, + "grad_norm": 0.18475461931198495, + "learning_rate": 1.686089220096294e-05, + "loss": 0.3035, + "num_tokens": 1324562252.0, + "step": 1737 + }, + { + "epoch": 2.371438020761098, + "grad_norm": 0.19228307601426683, + "learning_rate": 1.685711892365703e-05, + "loss": 0.3159, + "num_tokens": 1325352214.0, + "step": 1738 + }, + { + "epoch": 2.3728036017197787, + "grad_norm": 0.1964553078434741, + "learning_rate": 1.6853343859597788e-05, + "loss": 0.3015, + "num_tokens": 1326106083.0, + "step": 1739 + }, + { + "epoch": 2.3741691826784592, + "grad_norm": 0.1824549810438216, + "learning_rate": 1.684956700993682e-05, + "loss": 0.2959, + "num_tokens": 1326884407.0, + "step": 1740 + }, + { + "epoch": 2.3755347636371393, + "grad_norm": 0.1910958164305454, + "learning_rate": 1.684578837582628e-05, + "loss": 0.3087, + "num_tokens": 1327620400.0, + "step": 1741 + }, + { + "epoch": 2.37690034459582, + "grad_norm": 0.2069122557169906, + "learning_rate": 1.6842007958418872e-05, + "loss": 0.3102, + "num_tokens": 1328369028.0, + "step": 1742 + }, + { + "epoch": 2.3782659255545004, + "grad_norm": 0.19050795765191345, + "learning_rate": 1.6838225758867825e-05, + "loss": 0.2933, + "num_tokens": 1329110107.0, + "step": 1743 + }, + { + "epoch": 2.379631506513181, + "grad_norm": 0.1853091838143724, + "learning_rate": 1.6834441778326935e-05, + "loss": 0.2927, + "num_tokens": 1329864602.0, + "step": 1744 + }, + { + "epoch": 2.3809970874718616, + "grad_norm": 0.18559572262201543, + "learning_rate": 1.683065601795053e-05, + "loss": 0.3059, + "num_tokens": 1330707441.0, + "step": 1745 + }, + { + "epoch": 2.382362668430542, + "grad_norm": 0.18644185176733416, + "learning_rate": 1.682686847889347e-05, + "loss": 0.2984, + "num_tokens": 1331401316.0, + "step": 1746 + }, + { + "epoch": 2.3837282493892227, + "grad_norm": 0.19261806192168815, + "learning_rate": 1.682307916231118e-05, + "loss": 0.3082, + "num_tokens": 1332189804.0, + "step": 1747 + }, + { + "epoch": 2.3850938303479032, + "grad_norm": 0.20889617181108358, + "learning_rate": 1.6819288069359615e-05, + "loss": 0.2989, + "num_tokens": 1332847679.0, + "step": 1748 + }, + { + "epoch": 2.386459411306584, + "grad_norm": 0.19758380858618335, + "learning_rate": 1.6815495201195273e-05, + "loss": 0.3174, + "num_tokens": 1333603702.0, + "step": 1749 + }, + { + "epoch": 2.387824992265264, + "grad_norm": 0.1919446630613184, + "learning_rate": 1.681170055897519e-05, + "loss": 0.3043, + "num_tokens": 1334412987.0, + "step": 1750 + }, + { + "epoch": 2.3891905732239445, + "grad_norm": 0.19740319769283052, + "learning_rate": 1.680790414385695e-05, + "loss": 0.315, + "num_tokens": 1335178156.0, + "step": 1751 + }, + { + "epoch": 2.390556154182625, + "grad_norm": 0.20044992737745712, + "learning_rate": 1.680410595699868e-05, + "loss": 0.3083, + "num_tokens": 1335961601.0, + "step": 1752 + }, + { + "epoch": 2.3919217351413056, + "grad_norm": 0.21296519076299583, + "learning_rate": 1.6800305999559033e-05, + "loss": 0.2972, + "num_tokens": 1336703067.0, + "step": 1753 + }, + { + "epoch": 2.393287316099986, + "grad_norm": 0.19977033490623092, + "learning_rate": 1.6796504272697214e-05, + "loss": 0.3062, + "num_tokens": 1337459233.0, + "step": 1754 + }, + { + "epoch": 2.3946528970586667, + "grad_norm": 0.19139425118068465, + "learning_rate": 1.679270077757297e-05, + "loss": 0.3108, + "num_tokens": 1338191424.0, + "step": 1755 + }, + { + "epoch": 2.3960184780173472, + "grad_norm": 0.19191867991771133, + "learning_rate": 1.6788895515346576e-05, + "loss": 0.3151, + "num_tokens": 1338961879.0, + "step": 1756 + }, + { + "epoch": 2.397384058976028, + "grad_norm": 0.18356033301755798, + "learning_rate": 1.6785088487178857e-05, + "loss": 0.3204, + "num_tokens": 1339773054.0, + "step": 1757 + }, + { + "epoch": 2.3987496399347084, + "grad_norm": 0.21797916226012926, + "learning_rate": 1.678127969423117e-05, + "loss": 0.3089, + "num_tokens": 1340539734.0, + "step": 1758 + }, + { + "epoch": 2.4001152208933885, + "grad_norm": 0.18355068197129334, + "learning_rate": 1.6777469137665414e-05, + "loss": 0.3223, + "num_tokens": 1341291107.0, + "step": 1759 + }, + { + "epoch": 2.401480801852069, + "grad_norm": 0.3246926804687585, + "learning_rate": 1.677365681864403e-05, + "loss": 0.3139, + "num_tokens": 1342082237.0, + "step": 1760 + }, + { + "epoch": 2.4028463828107496, + "grad_norm": 0.19947208310684872, + "learning_rate": 1.6769842738329982e-05, + "loss": 0.3044, + "num_tokens": 1342771301.0, + "step": 1761 + }, + { + "epoch": 2.40421196376943, + "grad_norm": 0.19274663663254638, + "learning_rate": 1.6766026897886786e-05, + "loss": 0.3092, + "num_tokens": 1343585949.0, + "step": 1762 + }, + { + "epoch": 2.4055775447281107, + "grad_norm": 0.1938610460831104, + "learning_rate": 1.676220929847849e-05, + "loss": 0.3205, + "num_tokens": 1344453263.0, + "step": 1763 + }, + { + "epoch": 2.4069431256867913, + "grad_norm": 0.2016961687001354, + "learning_rate": 1.6758389941269678e-05, + "loss": 0.3177, + "num_tokens": 1345189836.0, + "step": 1764 + }, + { + "epoch": 2.408308706645472, + "grad_norm": 0.2112939565435269, + "learning_rate": 1.675456882742547e-05, + "loss": 0.3061, + "num_tokens": 1345935629.0, + "step": 1765 + }, + { + "epoch": 2.4096742876041524, + "grad_norm": 0.20541956899395342, + "learning_rate": 1.6750745958111518e-05, + "loss": 0.323, + "num_tokens": 1346704479.0, + "step": 1766 + }, + { + "epoch": 2.411039868562833, + "grad_norm": 0.21735762835936648, + "learning_rate": 1.674692133449402e-05, + "loss": 0.3149, + "num_tokens": 1347589680.0, + "step": 1767 + }, + { + "epoch": 2.412405449521513, + "grad_norm": 0.2037819349926451, + "learning_rate": 1.6743094957739702e-05, + "loss": 0.2901, + "num_tokens": 1348338987.0, + "step": 1768 + }, + { + "epoch": 2.4137710304801936, + "grad_norm": 0.19199746126601053, + "learning_rate": 1.6739266829015822e-05, + "loss": 0.3054, + "num_tokens": 1349106948.0, + "step": 1769 + }, + { + "epoch": 2.415136611438874, + "grad_norm": 0.20104827265082742, + "learning_rate": 1.673543694949018e-05, + "loss": 0.3025, + "num_tokens": 1349835026.0, + "step": 1770 + }, + { + "epoch": 2.4165021923975547, + "grad_norm": 0.20358652064853344, + "learning_rate": 1.6731605320331104e-05, + "loss": 0.3078, + "num_tokens": 1350601070.0, + "step": 1771 + }, + { + "epoch": 2.4178677733562353, + "grad_norm": 0.18925895869657822, + "learning_rate": 1.6727771942707463e-05, + "loss": 0.3084, + "num_tokens": 1351296728.0, + "step": 1772 + }, + { + "epoch": 2.419233354314916, + "grad_norm": 0.20301310859088148, + "learning_rate": 1.672393681778865e-05, + "loss": 0.3112, + "num_tokens": 1352072637.0, + "step": 1773 + }, + { + "epoch": 2.4205989352735964, + "grad_norm": 0.18532830854899202, + "learning_rate": 1.6720099946744595e-05, + "loss": 0.3184, + "num_tokens": 1352887029.0, + "step": 1774 + }, + { + "epoch": 2.421964516232277, + "grad_norm": 0.19253204620778372, + "learning_rate": 1.6716261330745764e-05, + "loss": 0.3052, + "num_tokens": 1353583699.0, + "step": 1775 + }, + { + "epoch": 2.4233300971909575, + "grad_norm": 0.1948269589186321, + "learning_rate": 1.6712420970963152e-05, + "loss": 0.3107, + "num_tokens": 1354351110.0, + "step": 1776 + }, + { + "epoch": 2.4246956781496376, + "grad_norm": 0.197498572970463, + "learning_rate": 1.6708578868568287e-05, + "loss": 0.3164, + "num_tokens": 1355114870.0, + "step": 1777 + }, + { + "epoch": 2.426061259108318, + "grad_norm": 0.19103693468603314, + "learning_rate": 1.6704735024733222e-05, + "loss": 0.3036, + "num_tokens": 1355839648.0, + "step": 1778 + }, + { + "epoch": 2.4274268400669987, + "grad_norm": 0.1971678846172706, + "learning_rate": 1.6700889440630552e-05, + "loss": 0.299, + "num_tokens": 1356605855.0, + "step": 1779 + }, + { + "epoch": 2.4287924210256793, + "grad_norm": 0.1879840678003008, + "learning_rate": 1.66970421174334e-05, + "loss": 0.3189, + "num_tokens": 1357427989.0, + "step": 1780 + }, + { + "epoch": 2.43015800198436, + "grad_norm": 0.1827175412572702, + "learning_rate": 1.6693193056315418e-05, + "loss": 0.3064, + "num_tokens": 1358245928.0, + "step": 1781 + }, + { + "epoch": 2.4315235829430404, + "grad_norm": 0.18628506083304214, + "learning_rate": 1.6689342258450784e-05, + "loss": 0.3171, + "num_tokens": 1359027679.0, + "step": 1782 + }, + { + "epoch": 2.432889163901721, + "grad_norm": 0.20039822186728254, + "learning_rate": 1.6685489725014214e-05, + "loss": 0.2999, + "num_tokens": 1359814448.0, + "step": 1783 + }, + { + "epoch": 2.4342547448604015, + "grad_norm": 0.17300985386462137, + "learning_rate": 1.668163545718094e-05, + "loss": 0.292, + "num_tokens": 1360540315.0, + "step": 1784 + }, + { + "epoch": 2.435620325819082, + "grad_norm": 0.1874318926728766, + "learning_rate": 1.6677779456126745e-05, + "loss": 0.3096, + "num_tokens": 1361311581.0, + "step": 1785 + }, + { + "epoch": 2.436985906777762, + "grad_norm": 0.19525452372091123, + "learning_rate": 1.667392172302792e-05, + "loss": 0.3081, + "num_tokens": 1362128968.0, + "step": 1786 + }, + { + "epoch": 2.4383514877364427, + "grad_norm": 0.18579020509499333, + "learning_rate": 1.667006225906129e-05, + "loss": 0.2988, + "num_tokens": 1362844500.0, + "step": 1787 + }, + { + "epoch": 2.4397170686951233, + "grad_norm": 0.18689288857046346, + "learning_rate": 1.6666201065404217e-05, + "loss": 0.3155, + "num_tokens": 1363665788.0, + "step": 1788 + }, + { + "epoch": 2.441082649653804, + "grad_norm": 0.1829712948817836, + "learning_rate": 1.6662338143234585e-05, + "loss": 0.3144, + "num_tokens": 1364433671.0, + "step": 1789 + }, + { + "epoch": 2.4424482306124844, + "grad_norm": 0.1863865382545085, + "learning_rate": 1.6658473493730795e-05, + "loss": 0.3143, + "num_tokens": 1365282190.0, + "step": 1790 + }, + { + "epoch": 2.443813811571165, + "grad_norm": 0.1847382793163872, + "learning_rate": 1.6654607118071792e-05, + "loss": 0.3095, + "num_tokens": 1366045094.0, + "step": 1791 + }, + { + "epoch": 2.4451793925298455, + "grad_norm": 0.195986415804053, + "learning_rate": 1.665073901743704e-05, + "loss": 0.3059, + "num_tokens": 1366763997.0, + "step": 1792 + }, + { + "epoch": 2.446544973488526, + "grad_norm": 0.16838809805175659, + "learning_rate": 1.6646869193006523e-05, + "loss": 0.3125, + "num_tokens": 1367565978.0, + "step": 1793 + }, + { + "epoch": 2.4479105544472066, + "grad_norm": 0.1904158903869617, + "learning_rate": 1.6642997645960758e-05, + "loss": 0.3083, + "num_tokens": 1368338571.0, + "step": 1794 + }, + { + "epoch": 2.4492761354058867, + "grad_norm": 0.1698568891877568, + "learning_rate": 1.6639124377480796e-05, + "loss": 0.3017, + "num_tokens": 1369081718.0, + "step": 1795 + }, + { + "epoch": 2.4506417163645673, + "grad_norm": 0.19670303272643044, + "learning_rate": 1.663524938874819e-05, + "loss": 0.3215, + "num_tokens": 1369841051.0, + "step": 1796 + }, + { + "epoch": 2.452007297323248, + "grad_norm": 0.18042457437149495, + "learning_rate": 1.663137268094504e-05, + "loss": 0.2881, + "num_tokens": 1370607944.0, + "step": 1797 + }, + { + "epoch": 2.4533728782819284, + "grad_norm": 0.18056674729188132, + "learning_rate": 1.662749425525396e-05, + "loss": 0.3125, + "num_tokens": 1371402221.0, + "step": 1798 + }, + { + "epoch": 2.454738459240609, + "grad_norm": 0.1962706256278912, + "learning_rate": 1.6623614112858087e-05, + "loss": 0.3146, + "num_tokens": 1372189550.0, + "step": 1799 + }, + { + "epoch": 2.4561040401992895, + "grad_norm": 0.17048026269053626, + "learning_rate": 1.6619732254941085e-05, + "loss": 0.3074, + "num_tokens": 1372857974.0, + "step": 1800 + }, + { + "epoch": 2.45746962115797, + "grad_norm": 0.20183197013013823, + "learning_rate": 1.6615848682687145e-05, + "loss": 0.3134, + "num_tokens": 1373613075.0, + "step": 1801 + }, + { + "epoch": 2.4588352021166506, + "grad_norm": 0.18835061137348444, + "learning_rate": 1.661196339728097e-05, + "loss": 0.3024, + "num_tokens": 1374334754.0, + "step": 1802 + }, + { + "epoch": 2.460200783075331, + "grad_norm": 0.2055162165725593, + "learning_rate": 1.66080763999078e-05, + "loss": 0.2975, + "num_tokens": 1375035870.0, + "step": 1803 + }, + { + "epoch": 2.4615663640340113, + "grad_norm": 0.19394692473860006, + "learning_rate": 1.6604187691753384e-05, + "loss": 0.3042, + "num_tokens": 1375764284.0, + "step": 1804 + }, + { + "epoch": 2.462931944992692, + "grad_norm": 0.19879226689360469, + "learning_rate": 1.6600297274003997e-05, + "loss": 0.302, + "num_tokens": 1376508016.0, + "step": 1805 + }, + { + "epoch": 2.4642975259513724, + "grad_norm": 0.19463303761962258, + "learning_rate": 1.6596405147846443e-05, + "loss": 0.3061, + "num_tokens": 1377276417.0, + "step": 1806 + }, + { + "epoch": 2.465663106910053, + "grad_norm": 0.2045789489462508, + "learning_rate": 1.659251131446804e-05, + "loss": 0.3041, + "num_tokens": 1378034170.0, + "step": 1807 + }, + { + "epoch": 2.4670286878687335, + "grad_norm": 0.18055824558524616, + "learning_rate": 1.6588615775056624e-05, + "loss": 0.2892, + "num_tokens": 1378794263.0, + "step": 1808 + }, + { + "epoch": 2.468394268827414, + "grad_norm": 0.18571029888485197, + "learning_rate": 1.6584718530800555e-05, + "loss": 0.3125, + "num_tokens": 1379578390.0, + "step": 1809 + }, + { + "epoch": 2.4697598497860946, + "grad_norm": 0.1816227023493871, + "learning_rate": 1.6580819582888722e-05, + "loss": 0.3326, + "num_tokens": 1380352357.0, + "step": 1810 + }, + { + "epoch": 2.471125430744775, + "grad_norm": 0.19984743169792377, + "learning_rate": 1.6576918932510515e-05, + "loss": 0.2952, + "num_tokens": 1381028630.0, + "step": 1811 + }, + { + "epoch": 2.4724910117034558, + "grad_norm": 0.17787967566070575, + "learning_rate": 1.6573016580855855e-05, + "loss": 0.3109, + "num_tokens": 1381810142.0, + "step": 1812 + }, + { + "epoch": 2.473856592662136, + "grad_norm": 0.1985651401099988, + "learning_rate": 1.6569112529115183e-05, + "loss": 0.3044, + "num_tokens": 1382589992.0, + "step": 1813 + }, + { + "epoch": 2.4752221736208164, + "grad_norm": 0.17589406487444043, + "learning_rate": 1.6565206778479462e-05, + "loss": 0.2933, + "num_tokens": 1383367392.0, + "step": 1814 + }, + { + "epoch": 2.476587754579497, + "grad_norm": 0.17131589659778726, + "learning_rate": 1.656129933014016e-05, + "loss": 0.3062, + "num_tokens": 1384131046.0, + "step": 1815 + }, + { + "epoch": 2.4779533355381775, + "grad_norm": 0.2324413531802864, + "learning_rate": 1.6557390185289268e-05, + "loss": 0.3122, + "num_tokens": 1384845362.0, + "step": 1816 + }, + { + "epoch": 2.479318916496858, + "grad_norm": 0.19204721213490225, + "learning_rate": 1.6553479345119305e-05, + "loss": 0.3189, + "num_tokens": 1385634732.0, + "step": 1817 + }, + { + "epoch": 2.4806844974555387, + "grad_norm": 0.20179048559254273, + "learning_rate": 1.6549566810823287e-05, + "loss": 0.2934, + "num_tokens": 1386412843.0, + "step": 1818 + }, + { + "epoch": 2.482050078414219, + "grad_norm": 0.1871331448871095, + "learning_rate": 1.6545652583594772e-05, + "loss": 0.3052, + "num_tokens": 1387173246.0, + "step": 1819 + }, + { + "epoch": 2.4834156593728998, + "grad_norm": 0.20534237035076985, + "learning_rate": 1.6541736664627817e-05, + "loss": 0.3043, + "num_tokens": 1387923077.0, + "step": 1820 + }, + { + "epoch": 2.4847812403315803, + "grad_norm": 0.20554996223630848, + "learning_rate": 1.6537819055116994e-05, + "loss": 0.3297, + "num_tokens": 1388725455.0, + "step": 1821 + }, + { + "epoch": 2.4861468212902604, + "grad_norm": 0.18282115665065474, + "learning_rate": 1.6533899756257404e-05, + "loss": 0.2997, + "num_tokens": 1389420512.0, + "step": 1822 + }, + { + "epoch": 2.487512402248941, + "grad_norm": 0.20202467237289098, + "learning_rate": 1.652997876924465e-05, + "loss": 0.2954, + "num_tokens": 1390162276.0, + "step": 1823 + }, + { + "epoch": 2.4888779832076215, + "grad_norm": 0.1981734857114372, + "learning_rate": 1.6526056095274853e-05, + "loss": 0.3171, + "num_tokens": 1390922240.0, + "step": 1824 + }, + { + "epoch": 2.490243564166302, + "grad_norm": 0.17068367108001678, + "learning_rate": 1.652213173554466e-05, + "loss": 0.2966, + "num_tokens": 1391659533.0, + "step": 1825 + }, + { + "epoch": 2.4916091451249827, + "grad_norm": 0.18652342701217167, + "learning_rate": 1.6518205691251212e-05, + "loss": 0.2997, + "num_tokens": 1392344814.0, + "step": 1826 + }, + { + "epoch": 2.492974726083663, + "grad_norm": 0.20401469276490514, + "learning_rate": 1.651427796359218e-05, + "loss": 0.3049, + "num_tokens": 1393149909.0, + "step": 1827 + }, + { + "epoch": 2.4943403070423438, + "grad_norm": 0.18292508568943253, + "learning_rate": 1.6510348553765748e-05, + "loss": 0.301, + "num_tokens": 1393925870.0, + "step": 1828 + }, + { + "epoch": 2.4957058880010243, + "grad_norm": 0.20881322230180588, + "learning_rate": 1.650641746297061e-05, + "loss": 0.318, + "num_tokens": 1394673408.0, + "step": 1829 + }, + { + "epoch": 2.497071468959705, + "grad_norm": 0.20872864550418962, + "learning_rate": 1.6502484692405957e-05, + "loss": 0.2997, + "num_tokens": 1395469951.0, + "step": 1830 + }, + { + "epoch": 2.498437049918385, + "grad_norm": 0.18536995811359025, + "learning_rate": 1.6498550243271523e-05, + "loss": 0.2916, + "num_tokens": 1396201746.0, + "step": 1831 + }, + { + "epoch": 2.4998026308770656, + "grad_norm": 0.17566702509453128, + "learning_rate": 1.6494614116767528e-05, + "loss": 0.3197, + "num_tokens": 1397073615.0, + "step": 1832 + }, + { + "epoch": 2.501168211835746, + "grad_norm": 0.19893092607256022, + "learning_rate": 1.649067631409472e-05, + "loss": 0.3061, + "num_tokens": 1397791284.0, + "step": 1833 + }, + { + "epoch": 2.5025337927944267, + "grad_norm": 0.20588124501966967, + "learning_rate": 1.648673683645435e-05, + "loss": 0.2998, + "num_tokens": 1398530726.0, + "step": 1834 + }, + { + "epoch": 2.5038993737531072, + "grad_norm": 0.19157104109156048, + "learning_rate": 1.648279568504818e-05, + "loss": 0.3155, + "num_tokens": 1399211721.0, + "step": 1835 + }, + { + "epoch": 2.505264954711788, + "grad_norm": 0.18344758376238146, + "learning_rate": 1.6478852861078486e-05, + "loss": 0.3089, + "num_tokens": 1400006125.0, + "step": 1836 + }, + { + "epoch": 2.5066305356704683, + "grad_norm": 0.19432139542098834, + "learning_rate": 1.6474908365748053e-05, + "loss": 0.3037, + "num_tokens": 1400804425.0, + "step": 1837 + }, + { + "epoch": 2.507996116629149, + "grad_norm": 0.17058818606556356, + "learning_rate": 1.647096220026018e-05, + "loss": 0.3058, + "num_tokens": 1401534459.0, + "step": 1838 + }, + { + "epoch": 2.5093616975878295, + "grad_norm": 0.20373951502928148, + "learning_rate": 1.646701436581866e-05, + "loss": 0.3183, + "num_tokens": 1402304702.0, + "step": 1839 + }, + { + "epoch": 2.5107272785465096, + "grad_norm": 0.18631203801907367, + "learning_rate": 1.646306486362782e-05, + "loss": 0.3133, + "num_tokens": 1403089389.0, + "step": 1840 + }, + { + "epoch": 2.51209285950519, + "grad_norm": 0.19512317900676007, + "learning_rate": 1.645911369489247e-05, + "loss": 0.3114, + "num_tokens": 1403876738.0, + "step": 1841 + }, + { + "epoch": 2.5134584404638707, + "grad_norm": 0.19641619117064488, + "learning_rate": 1.645516086081795e-05, + "loss": 0.3177, + "num_tokens": 1404604983.0, + "step": 1842 + }, + { + "epoch": 2.5148240214225512, + "grad_norm": 0.18974211346526246, + "learning_rate": 1.645120636261009e-05, + "loss": 0.2915, + "num_tokens": 1405329463.0, + "step": 1843 + }, + { + "epoch": 2.516189602381232, + "grad_norm": 0.1832146511251332, + "learning_rate": 1.644725020147524e-05, + "loss": 0.2966, + "num_tokens": 1405990295.0, + "step": 1844 + }, + { + "epoch": 2.5175551833399124, + "grad_norm": 0.170318686947924, + "learning_rate": 1.644329237862026e-05, + "loss": 0.3065, + "num_tokens": 1406792919.0, + "step": 1845 + }, + { + "epoch": 2.518920764298593, + "grad_norm": 0.17598531674618775, + "learning_rate": 1.64393328952525e-05, + "loss": 0.3072, + "num_tokens": 1407608999.0, + "step": 1846 + }, + { + "epoch": 2.5202863452572735, + "grad_norm": 0.29972620089882906, + "learning_rate": 1.643537175257983e-05, + "loss": 0.3084, + "num_tokens": 1408395533.0, + "step": 1847 + }, + { + "epoch": 2.521651926215954, + "grad_norm": 0.1874873311247297, + "learning_rate": 1.6431408951810627e-05, + "loss": 0.3179, + "num_tokens": 1409107723.0, + "step": 1848 + }, + { + "epoch": 2.523017507174634, + "grad_norm": 0.18163896068235807, + "learning_rate": 1.6427444494153768e-05, + "loss": 0.3085, + "num_tokens": 1410008464.0, + "step": 1849 + }, + { + "epoch": 2.5243830881333147, + "grad_norm": 0.16357119503302645, + "learning_rate": 1.6423478380818633e-05, + "loss": 0.3196, + "num_tokens": 1410732552.0, + "step": 1850 + }, + { + "epoch": 2.5257486690919952, + "grad_norm": 0.2219804416061581, + "learning_rate": 1.6419510613015117e-05, + "loss": 0.3235, + "num_tokens": 1411446834.0, + "step": 1851 + }, + { + "epoch": 2.527114250050676, + "grad_norm": 0.1838395842260659, + "learning_rate": 1.641554119195361e-05, + "loss": 0.3078, + "num_tokens": 1412188644.0, + "step": 1852 + }, + { + "epoch": 2.5284798310093564, + "grad_norm": 0.1849320925698564, + "learning_rate": 1.6411570118845016e-05, + "loss": 0.3196, + "num_tokens": 1412963986.0, + "step": 1853 + }, + { + "epoch": 2.529845411968037, + "grad_norm": 0.1807209677052282, + "learning_rate": 1.6407597394900733e-05, + "loss": 0.2822, + "num_tokens": 1413668828.0, + "step": 1854 + }, + { + "epoch": 2.5312109929267175, + "grad_norm": 0.17741259345953658, + "learning_rate": 1.6403623021332667e-05, + "loss": 0.3237, + "num_tokens": 1414462053.0, + "step": 1855 + }, + { + "epoch": 2.532576573885398, + "grad_norm": 0.20456645635323153, + "learning_rate": 1.6399646999353226e-05, + "loss": 0.3008, + "num_tokens": 1415171077.0, + "step": 1856 + }, + { + "epoch": 2.5339421548440786, + "grad_norm": 0.19067755078085916, + "learning_rate": 1.6395669330175325e-05, + "loss": 0.2982, + "num_tokens": 1415939956.0, + "step": 1857 + }, + { + "epoch": 2.5353077358027587, + "grad_norm": 0.1937914314415248, + "learning_rate": 1.6391690015012382e-05, + "loss": 0.3023, + "num_tokens": 1416698042.0, + "step": 1858 + }, + { + "epoch": 2.5366733167614393, + "grad_norm": 0.17573899284033595, + "learning_rate": 1.6387709055078304e-05, + "loss": 0.3122, + "num_tokens": 1417580763.0, + "step": 1859 + }, + { + "epoch": 2.53803889772012, + "grad_norm": 0.1797478693160339, + "learning_rate": 1.638372645158752e-05, + "loss": 0.3079, + "num_tokens": 1418389376.0, + "step": 1860 + }, + { + "epoch": 2.5394044786788004, + "grad_norm": 0.2029951259386221, + "learning_rate": 1.6379742205754945e-05, + "loss": 0.3202, + "num_tokens": 1419112005.0, + "step": 1861 + }, + { + "epoch": 2.540770059637481, + "grad_norm": 0.19947710835425853, + "learning_rate": 1.6375756318795998e-05, + "loss": 0.2998, + "num_tokens": 1419933833.0, + "step": 1862 + }, + { + "epoch": 2.5421356405961615, + "grad_norm": 0.1880204093701435, + "learning_rate": 1.63717687919266e-05, + "loss": 0.316, + "num_tokens": 1420746843.0, + "step": 1863 + }, + { + "epoch": 2.543501221554842, + "grad_norm": 0.1950252142701813, + "learning_rate": 1.6367779626363177e-05, + "loss": 0.3133, + "num_tokens": 1421480601.0, + "step": 1864 + }, + { + "epoch": 2.5448668025135226, + "grad_norm": 0.19507473666637723, + "learning_rate": 1.636378882332265e-05, + "loss": 0.3068, + "num_tokens": 1422248596.0, + "step": 1865 + }, + { + "epoch": 2.546232383472203, + "grad_norm": 0.17775236401347858, + "learning_rate": 1.635979638402244e-05, + "loss": 0.3021, + "num_tokens": 1423045325.0, + "step": 1866 + }, + { + "epoch": 2.5475979644308833, + "grad_norm": 0.18357144788339586, + "learning_rate": 1.6355802309680466e-05, + "loss": 0.2986, + "num_tokens": 1423951072.0, + "step": 1867 + }, + { + "epoch": 2.548963545389564, + "grad_norm": 0.19744844581714022, + "learning_rate": 1.6351806601515143e-05, + "loss": 0.3005, + "num_tokens": 1424676843.0, + "step": 1868 + }, + { + "epoch": 2.5503291263482444, + "grad_norm": 0.17359389248451276, + "learning_rate": 1.6347809260745395e-05, + "loss": 0.3085, + "num_tokens": 1425512818.0, + "step": 1869 + }, + { + "epoch": 2.551694707306925, + "grad_norm": 0.19436015723823413, + "learning_rate": 1.634381028859064e-05, + "loss": 0.3152, + "num_tokens": 1426314016.0, + "step": 1870 + }, + { + "epoch": 2.5530602882656055, + "grad_norm": 0.17852789329580818, + "learning_rate": 1.633980968627078e-05, + "loss": 0.2978, + "num_tokens": 1427062453.0, + "step": 1871 + }, + { + "epoch": 2.554425869224286, + "grad_norm": 0.19432470082901612, + "learning_rate": 1.6335807455006238e-05, + "loss": 0.3133, + "num_tokens": 1427781104.0, + "step": 1872 + }, + { + "epoch": 2.5557914501829666, + "grad_norm": 0.18826108663964913, + "learning_rate": 1.633180359601791e-05, + "loss": 0.3026, + "num_tokens": 1428564427.0, + "step": 1873 + }, + { + "epoch": 2.557157031141647, + "grad_norm": 0.17584268908346987, + "learning_rate": 1.632779811052721e-05, + "loss": 0.298, + "num_tokens": 1429339515.0, + "step": 1874 + }, + { + "epoch": 2.5585226121003277, + "grad_norm": 0.18562188462648505, + "learning_rate": 1.632379099975603e-05, + "loss": 0.3138, + "num_tokens": 1430127707.0, + "step": 1875 + }, + { + "epoch": 2.559888193059008, + "grad_norm": 0.19128223659297722, + "learning_rate": 1.6319782264926773e-05, + "loss": 0.3073, + "num_tokens": 1430970867.0, + "step": 1876 + }, + { + "epoch": 2.5612537740176884, + "grad_norm": 0.20927619837329367, + "learning_rate": 1.6315771907262324e-05, + "loss": 0.3072, + "num_tokens": 1431685533.0, + "step": 1877 + }, + { + "epoch": 2.562619354976369, + "grad_norm": 0.20395475623053075, + "learning_rate": 1.6311759927986078e-05, + "loss": 0.3105, + "num_tokens": 1432420498.0, + "step": 1878 + }, + { + "epoch": 2.5639849359350495, + "grad_norm": 0.19273412904207954, + "learning_rate": 1.6307746328321906e-05, + "loss": 0.3041, + "num_tokens": 1433236123.0, + "step": 1879 + }, + { + "epoch": 2.56535051689373, + "grad_norm": 0.18296336822006742, + "learning_rate": 1.6303731109494193e-05, + "loss": 0.3137, + "num_tokens": 1434006479.0, + "step": 1880 + }, + { + "epoch": 2.5667160978524106, + "grad_norm": 0.19662674547777298, + "learning_rate": 1.6299714272727795e-05, + "loss": 0.3116, + "num_tokens": 1434769628.0, + "step": 1881 + }, + { + "epoch": 2.568081678811091, + "grad_norm": 0.1837085566266636, + "learning_rate": 1.6295695819248088e-05, + "loss": 0.3105, + "num_tokens": 1435552685.0, + "step": 1882 + }, + { + "epoch": 2.5694472597697717, + "grad_norm": 0.20523535664034614, + "learning_rate": 1.6291675750280928e-05, + "loss": 0.3056, + "num_tokens": 1436327722.0, + "step": 1883 + }, + { + "epoch": 2.5708128407284523, + "grad_norm": 0.18281419755374187, + "learning_rate": 1.6287654067052657e-05, + "loss": 0.3206, + "num_tokens": 1437057891.0, + "step": 1884 + }, + { + "epoch": 2.5721784216871324, + "grad_norm": 0.2091946783319462, + "learning_rate": 1.628363077079012e-05, + "loss": 0.3069, + "num_tokens": 1437803906.0, + "step": 1885 + }, + { + "epoch": 2.573544002645813, + "grad_norm": 0.18686103802582169, + "learning_rate": 1.6279605862720644e-05, + "loss": 0.3078, + "num_tokens": 1438553462.0, + "step": 1886 + }, + { + "epoch": 2.5749095836044935, + "grad_norm": 0.3100589973500473, + "learning_rate": 1.6275579344072064e-05, + "loss": 0.3122, + "num_tokens": 1439302157.0, + "step": 1887 + }, + { + "epoch": 2.576275164563174, + "grad_norm": 0.19386902737651387, + "learning_rate": 1.627155121607269e-05, + "loss": 0.3194, + "num_tokens": 1440092772.0, + "step": 1888 + }, + { + "epoch": 2.5776407455218546, + "grad_norm": 0.18458489216829368, + "learning_rate": 1.6267521479951332e-05, + "loss": 0.3039, + "num_tokens": 1440812008.0, + "step": 1889 + }, + { + "epoch": 2.579006326480535, + "grad_norm": 0.19543146636823708, + "learning_rate": 1.626349013693729e-05, + "loss": 0.2999, + "num_tokens": 1441576985.0, + "step": 1890 + }, + { + "epoch": 2.5803719074392157, + "grad_norm": 0.1792935100828067, + "learning_rate": 1.6259457188260345e-05, + "loss": 0.3066, + "num_tokens": 1442375915.0, + "step": 1891 + }, + { + "epoch": 2.5817374883978963, + "grad_norm": 0.1999498118318816, + "learning_rate": 1.6255422635150783e-05, + "loss": 0.2964, + "num_tokens": 1443115738.0, + "step": 1892 + }, + { + "epoch": 2.583103069356577, + "grad_norm": 0.1886253689580606, + "learning_rate": 1.6251386478839367e-05, + "loss": 0.3184, + "num_tokens": 1443886074.0, + "step": 1893 + }, + { + "epoch": 2.584468650315257, + "grad_norm": 0.1986148467145687, + "learning_rate": 1.6247348720557353e-05, + "loss": 0.3053, + "num_tokens": 1444634059.0, + "step": 1894 + }, + { + "epoch": 2.5858342312739375, + "grad_norm": 0.21183333487230394, + "learning_rate": 1.624330936153649e-05, + "loss": 0.3231, + "num_tokens": 1445454319.0, + "step": 1895 + }, + { + "epoch": 2.587199812232618, + "grad_norm": 0.20845393069253954, + "learning_rate": 1.623926840300901e-05, + "loss": 0.3189, + "num_tokens": 1446236930.0, + "step": 1896 + }, + { + "epoch": 2.5885653931912986, + "grad_norm": 0.196179598653693, + "learning_rate": 1.623522584620763e-05, + "loss": 0.3103, + "num_tokens": 1446990959.0, + "step": 1897 + }, + { + "epoch": 2.589930974149979, + "grad_norm": 0.18662345439417685, + "learning_rate": 1.6231181692365567e-05, + "loss": 0.3184, + "num_tokens": 1447721442.0, + "step": 1898 + }, + { + "epoch": 2.5912965551086597, + "grad_norm": 0.20716555157166938, + "learning_rate": 1.622713594271651e-05, + "loss": 0.314, + "num_tokens": 1448455544.0, + "step": 1899 + }, + { + "epoch": 2.5926621360673403, + "grad_norm": 0.21809703453251683, + "learning_rate": 1.6223088598494647e-05, + "loss": 0.3148, + "num_tokens": 1449272081.0, + "step": 1900 + }, + { + "epoch": 2.594027717026021, + "grad_norm": 0.1840729877335866, + "learning_rate": 1.6219039660934644e-05, + "loss": 0.3004, + "num_tokens": 1450028131.0, + "step": 1901 + }, + { + "epoch": 2.5953932979847014, + "grad_norm": 0.19623738461388052, + "learning_rate": 1.6214989131271658e-05, + "loss": 0.3134, + "num_tokens": 1450771886.0, + "step": 1902 + }, + { + "epoch": 2.5967588789433815, + "grad_norm": 0.1879133926088854, + "learning_rate": 1.621093701074133e-05, + "loss": 0.3125, + "num_tokens": 1451523813.0, + "step": 1903 + }, + { + "epoch": 2.598124459902062, + "grad_norm": 0.19222472056042555, + "learning_rate": 1.6206883300579785e-05, + "loss": 0.3068, + "num_tokens": 1452231344.0, + "step": 1904 + }, + { + "epoch": 2.5994900408607426, + "grad_norm": 0.19853856488387872, + "learning_rate": 1.6202828002023636e-05, + "loss": 0.3288, + "num_tokens": 1452958199.0, + "step": 1905 + }, + { + "epoch": 2.600855621819423, + "grad_norm": 0.1867461065792624, + "learning_rate": 1.6198771116309982e-05, + "loss": 0.3171, + "num_tokens": 1453778010.0, + "step": 1906 + }, + { + "epoch": 2.6022212027781038, + "grad_norm": 0.2060554783728174, + "learning_rate": 1.61947126446764e-05, + "loss": 0.2992, + "num_tokens": 1454491324.0, + "step": 1907 + }, + { + "epoch": 2.6035867837367843, + "grad_norm": 0.19019171691173425, + "learning_rate": 1.619065258836095e-05, + "loss": 0.3132, + "num_tokens": 1455341194.0, + "step": 1908 + }, + { + "epoch": 2.604952364695465, + "grad_norm": 0.18897337960743313, + "learning_rate": 1.6186590948602187e-05, + "loss": 0.3144, + "num_tokens": 1456106730.0, + "step": 1909 + }, + { + "epoch": 2.6063179456541454, + "grad_norm": 0.19118505121416798, + "learning_rate": 1.6182527726639133e-05, + "loss": 0.3066, + "num_tokens": 1456857613.0, + "step": 1910 + }, + { + "epoch": 2.607683526612826, + "grad_norm": 0.19816234019608836, + "learning_rate": 1.617846292371131e-05, + "loss": 0.301, + "num_tokens": 1457591432.0, + "step": 1911 + }, + { + "epoch": 2.609049107571506, + "grad_norm": 0.17305868073983566, + "learning_rate": 1.617439654105871e-05, + "loss": 0.3054, + "num_tokens": 1458368779.0, + "step": 1912 + }, + { + "epoch": 2.6104146885301867, + "grad_norm": 0.1892454202067193, + "learning_rate": 1.6170328579921805e-05, + "loss": 0.2979, + "num_tokens": 1459066112.0, + "step": 1913 + }, + { + "epoch": 2.611780269488867, + "grad_norm": 0.19960050602208107, + "learning_rate": 1.616625904154156e-05, + "loss": 0.3043, + "num_tokens": 1459791952.0, + "step": 1914 + }, + { + "epoch": 2.6131458504475478, + "grad_norm": 0.17256542768792485, + "learning_rate": 1.6162187927159415e-05, + "loss": 0.3047, + "num_tokens": 1460541770.0, + "step": 1915 + }, + { + "epoch": 2.6145114314062283, + "grad_norm": 0.2016512719915629, + "learning_rate": 1.615811523801729e-05, + "loss": 0.2989, + "num_tokens": 1461356324.0, + "step": 1916 + }, + { + "epoch": 2.615877012364909, + "grad_norm": 0.18149632154924267, + "learning_rate": 1.6154040975357583e-05, + "loss": 0.3206, + "num_tokens": 1462143492.0, + "step": 1917 + }, + { + "epoch": 2.6172425933235894, + "grad_norm": 0.2043413613363849, + "learning_rate": 1.6149965140423172e-05, + "loss": 0.307, + "num_tokens": 1462857213.0, + "step": 1918 + }, + { + "epoch": 2.61860817428227, + "grad_norm": 0.19133462665870038, + "learning_rate": 1.6145887734457432e-05, + "loss": 0.2925, + "num_tokens": 1463488127.0, + "step": 1919 + }, + { + "epoch": 2.6199737552409506, + "grad_norm": 0.19064953902465892, + "learning_rate": 1.6141808758704186e-05, + "loss": 0.3051, + "num_tokens": 1464297199.0, + "step": 1920 + }, + { + "epoch": 2.6213393361996307, + "grad_norm": 0.18652768438623393, + "learning_rate": 1.6137728214407766e-05, + "loss": 0.3094, + "num_tokens": 1465095846.0, + "step": 1921 + }, + { + "epoch": 2.622704917158311, + "grad_norm": 0.17920156391457287, + "learning_rate": 1.6133646102812957e-05, + "loss": 0.2996, + "num_tokens": 1465840495.0, + "step": 1922 + }, + { + "epoch": 2.624070498116992, + "grad_norm": 0.1861393213900466, + "learning_rate": 1.612956242516505e-05, + "loss": 0.3165, + "num_tokens": 1466624399.0, + "step": 1923 + }, + { + "epoch": 2.6254360790756723, + "grad_norm": 0.17824811323546286, + "learning_rate": 1.6125477182709784e-05, + "loss": 0.2999, + "num_tokens": 1467392149.0, + "step": 1924 + }, + { + "epoch": 2.626801660034353, + "grad_norm": 0.1870470948269081, + "learning_rate": 1.6121390376693394e-05, + "loss": 0.3094, + "num_tokens": 1468194714.0, + "step": 1925 + }, + { + "epoch": 2.6281672409930334, + "grad_norm": 0.18157683507707956, + "learning_rate": 1.611730200836259e-05, + "loss": 0.3009, + "num_tokens": 1468921710.0, + "step": 1926 + }, + { + "epoch": 2.629532821951714, + "grad_norm": 0.18630761475974564, + "learning_rate": 1.6113212078964558e-05, + "loss": 0.3011, + "num_tokens": 1469749002.0, + "step": 1927 + }, + { + "epoch": 2.6308984029103946, + "grad_norm": 0.17952364575357013, + "learning_rate": 1.6109120589746953e-05, + "loss": 0.307, + "num_tokens": 1470519522.0, + "step": 1928 + }, + { + "epoch": 2.632263983869075, + "grad_norm": 0.18688291640483587, + "learning_rate": 1.6105027541957914e-05, + "loss": 0.3059, + "num_tokens": 1471248108.0, + "step": 1929 + }, + { + "epoch": 2.6336295648277552, + "grad_norm": 0.18929248456484843, + "learning_rate": 1.610093293684605e-05, + "loss": 0.306, + "num_tokens": 1471937767.0, + "step": 1930 + }, + { + "epoch": 2.634995145786436, + "grad_norm": 0.19219341504176976, + "learning_rate": 1.609683677566045e-05, + "loss": 0.302, + "num_tokens": 1472682276.0, + "step": 1931 + }, + { + "epoch": 2.6363607267451163, + "grad_norm": 0.1857107976934036, + "learning_rate": 1.609273905965068e-05, + "loss": 0.3209, + "num_tokens": 1473508408.0, + "step": 1932 + }, + { + "epoch": 2.637726307703797, + "grad_norm": 0.17809504353981542, + "learning_rate": 1.6088639790066763e-05, + "loss": 0.2955, + "num_tokens": 1474299012.0, + "step": 1933 + }, + { + "epoch": 2.6390918886624775, + "grad_norm": 0.1829549186326802, + "learning_rate": 1.6084538968159217e-05, + "loss": 0.3062, + "num_tokens": 1475060852.0, + "step": 1934 + }, + { + "epoch": 2.640457469621158, + "grad_norm": 0.19444955920007664, + "learning_rate": 1.6080436595179028e-05, + "loss": 0.3096, + "num_tokens": 1475791789.0, + "step": 1935 + }, + { + "epoch": 2.6418230505798386, + "grad_norm": 0.21423100625359065, + "learning_rate": 1.607633267237765e-05, + "loss": 0.3314, + "num_tokens": 1476557808.0, + "step": 1936 + }, + { + "epoch": 2.643188631538519, + "grad_norm": 0.19808431640029442, + "learning_rate": 1.6072227201007006e-05, + "loss": 0.3118, + "num_tokens": 1477365856.0, + "step": 1937 + }, + { + "epoch": 2.6445542124971997, + "grad_norm": 0.2080328732708811, + "learning_rate": 1.6068120182319506e-05, + "loss": 0.3135, + "num_tokens": 1478097410.0, + "step": 1938 + }, + { + "epoch": 2.64591979345588, + "grad_norm": 0.20736411986193498, + "learning_rate": 1.606401161756802e-05, + "loss": 0.3111, + "num_tokens": 1478863190.0, + "step": 1939 + }, + { + "epoch": 2.6472853744145604, + "grad_norm": 0.18344376864227943, + "learning_rate": 1.6059901508005894e-05, + "loss": 0.3071, + "num_tokens": 1479639316.0, + "step": 1940 + }, + { + "epoch": 2.648650955373241, + "grad_norm": 0.19662723114659078, + "learning_rate": 1.605578985488694e-05, + "loss": 0.309, + "num_tokens": 1480382633.0, + "step": 1941 + }, + { + "epoch": 2.6500165363319215, + "grad_norm": 0.19881150561054786, + "learning_rate": 1.6051676659465448e-05, + "loss": 0.3182, + "num_tokens": 1481048833.0, + "step": 1942 + }, + { + "epoch": 2.651382117290602, + "grad_norm": 0.2068832721569244, + "learning_rate": 1.6047561922996182e-05, + "loss": 0.3031, + "num_tokens": 1481764793.0, + "step": 1943 + }, + { + "epoch": 2.6527476982492826, + "grad_norm": 0.18243429150212043, + "learning_rate": 1.604344564673436e-05, + "loss": 0.3094, + "num_tokens": 1482587905.0, + "step": 1944 + }, + { + "epoch": 2.654113279207963, + "grad_norm": 0.20391940777603845, + "learning_rate": 1.603932783193569e-05, + "loss": 0.316, + "num_tokens": 1483388092.0, + "step": 1945 + }, + { + "epoch": 2.6554788601666437, + "grad_norm": 0.17099419166209298, + "learning_rate": 1.603520847985633e-05, + "loss": 0.2873, + "num_tokens": 1484154604.0, + "step": 1946 + }, + { + "epoch": 2.6568444411253243, + "grad_norm": 0.20127554485215263, + "learning_rate": 1.603108759175292e-05, + "loss": 0.3091, + "num_tokens": 1484914329.0, + "step": 1947 + }, + { + "epoch": 2.6582100220840044, + "grad_norm": 0.19569596646733337, + "learning_rate": 1.602696516888256e-05, + "loss": 0.3139, + "num_tokens": 1485659032.0, + "step": 1948 + }, + { + "epoch": 2.659575603042685, + "grad_norm": 0.19617826876756203, + "learning_rate": 1.6022841212502827e-05, + "loss": 0.2918, + "num_tokens": 1486381380.0, + "step": 1949 + }, + { + "epoch": 2.6609411840013655, + "grad_norm": 0.18432775172164842, + "learning_rate": 1.6018715723871762e-05, + "loss": 0.3047, + "num_tokens": 1487124409.0, + "step": 1950 + }, + { + "epoch": 2.662306764960046, + "grad_norm": 0.19271933152536327, + "learning_rate": 1.6014588704247873e-05, + "loss": 0.2979, + "num_tokens": 1487912413.0, + "step": 1951 + }, + { + "epoch": 2.6636723459187266, + "grad_norm": 0.18750291905434413, + "learning_rate": 1.601046015489013e-05, + "loss": 0.3243, + "num_tokens": 1488734754.0, + "step": 1952 + }, + { + "epoch": 2.665037926877407, + "grad_norm": 0.18374213491461577, + "learning_rate": 1.6006330077057987e-05, + "loss": 0.3209, + "num_tokens": 1489469100.0, + "step": 1953 + }, + { + "epoch": 2.6664035078360877, + "grad_norm": 0.2111620229182135, + "learning_rate": 1.6002198472011334e-05, + "loss": 0.2935, + "num_tokens": 1490173238.0, + "step": 1954 + }, + { + "epoch": 2.6677690887947683, + "grad_norm": 0.18248682414773168, + "learning_rate": 1.599806534101056e-05, + "loss": 0.3142, + "num_tokens": 1490952911.0, + "step": 1955 + }, + { + "epoch": 2.669134669753449, + "grad_norm": 0.20642846112482732, + "learning_rate": 1.5993930685316494e-05, + "loss": 0.2945, + "num_tokens": 1491672850.0, + "step": 1956 + }, + { + "epoch": 2.670500250712129, + "grad_norm": 0.17578282459340613, + "learning_rate": 1.598979450619045e-05, + "loss": 0.3003, + "num_tokens": 1492470706.0, + "step": 1957 + }, + { + "epoch": 2.6718658316708095, + "grad_norm": 0.18370256512509872, + "learning_rate": 1.5985656804894184e-05, + "loss": 0.3152, + "num_tokens": 1493229132.0, + "step": 1958 + }, + { + "epoch": 2.67323141262949, + "grad_norm": 0.19876504186671745, + "learning_rate": 1.5981517582689942e-05, + "loss": 0.2947, + "num_tokens": 1493955181.0, + "step": 1959 + }, + { + "epoch": 2.6745969935881706, + "grad_norm": 0.17876139155361528, + "learning_rate": 1.5977376840840416e-05, + "loss": 0.3057, + "num_tokens": 1494681409.0, + "step": 1960 + }, + { + "epoch": 2.675962574546851, + "grad_norm": 0.1835883610232555, + "learning_rate": 1.5973234580608767e-05, + "loss": 0.2993, + "num_tokens": 1495394894.0, + "step": 1961 + }, + { + "epoch": 2.6773281555055317, + "grad_norm": 0.19065572206936024, + "learning_rate": 1.5969090803258622e-05, + "loss": 0.3069, + "num_tokens": 1496151766.0, + "step": 1962 + }, + { + "epoch": 2.6786937364642123, + "grad_norm": 0.18315305643961519, + "learning_rate": 1.5964945510054066e-05, + "loss": 0.3261, + "num_tokens": 1496898845.0, + "step": 1963 + }, + { + "epoch": 2.680059317422893, + "grad_norm": 0.19384936994517646, + "learning_rate": 1.5960798702259647e-05, + "loss": 0.3108, + "num_tokens": 1497706631.0, + "step": 1964 + }, + { + "epoch": 2.6814248983815734, + "grad_norm": 0.17357645797926635, + "learning_rate": 1.595665038114038e-05, + "loss": 0.3106, + "num_tokens": 1498541856.0, + "step": 1965 + }, + { + "epoch": 2.6827904793402535, + "grad_norm": 0.19670567504020592, + "learning_rate": 1.595250054796173e-05, + "loss": 0.3085, + "num_tokens": 1499322830.0, + "step": 1966 + }, + { + "epoch": 2.684156060298934, + "grad_norm": 0.17672932944213016, + "learning_rate": 1.5948349203989642e-05, + "loss": 0.3032, + "num_tokens": 1499995157.0, + "step": 1967 + }, + { + "epoch": 2.6855216412576146, + "grad_norm": 0.20611559699087803, + "learning_rate": 1.5944196350490505e-05, + "loss": 0.318, + "num_tokens": 1500816621.0, + "step": 1968 + }, + { + "epoch": 2.686887222216295, + "grad_norm": 0.17473627599306446, + "learning_rate": 1.594004198873118e-05, + "loss": 0.3128, + "num_tokens": 1501661704.0, + "step": 1969 + }, + { + "epoch": 2.6882528031749757, + "grad_norm": 0.1749271467227588, + "learning_rate": 1.5935886119978977e-05, + "loss": 0.306, + "num_tokens": 1502361364.0, + "step": 1970 + }, + { + "epoch": 2.6896183841336563, + "grad_norm": 0.18415806208739374, + "learning_rate": 1.5931728745501675e-05, + "loss": 0.3025, + "num_tokens": 1503010898.0, + "step": 1971 + }, + { + "epoch": 2.690983965092337, + "grad_norm": 0.2008670509574121, + "learning_rate": 1.5927569866567504e-05, + "loss": 0.3155, + "num_tokens": 1503749531.0, + "step": 1972 + }, + { + "epoch": 2.6923495460510174, + "grad_norm": 0.17078202044409108, + "learning_rate": 1.5923409484445168e-05, + "loss": 0.289, + "num_tokens": 1504394244.0, + "step": 1973 + }, + { + "epoch": 2.693715127009698, + "grad_norm": 0.20885626768834292, + "learning_rate": 1.5919247600403808e-05, + "loss": 0.3138, + "num_tokens": 1505163958.0, + "step": 1974 + }, + { + "epoch": 2.695080707968378, + "grad_norm": 0.19013767250701316, + "learning_rate": 1.5915084215713036e-05, + "loss": 0.3016, + "num_tokens": 1505977645.0, + "step": 1975 + }, + { + "epoch": 2.6964462889270586, + "grad_norm": 0.2157650224900246, + "learning_rate": 1.5910919331642932e-05, + "loss": 0.2905, + "num_tokens": 1506674391.0, + "step": 1976 + }, + { + "epoch": 2.697811869885739, + "grad_norm": 0.18711202319051498, + "learning_rate": 1.5906752949464008e-05, + "loss": 0.3015, + "num_tokens": 1507423484.0, + "step": 1977 + }, + { + "epoch": 2.6991774508444197, + "grad_norm": 0.1778861821652975, + "learning_rate": 1.5902585070447257e-05, + "loss": 0.318, + "num_tokens": 1508212358.0, + "step": 1978 + }, + { + "epoch": 2.7005430318031003, + "grad_norm": 0.20251745426710593, + "learning_rate": 1.5898415695864113e-05, + "loss": 0.3016, + "num_tokens": 1508937222.0, + "step": 1979 + }, + { + "epoch": 2.701908612761781, + "grad_norm": 0.16643135541228601, + "learning_rate": 1.589424482698647e-05, + "loss": 0.322, + "num_tokens": 1509691104.0, + "step": 1980 + }, + { + "epoch": 2.7032741937204614, + "grad_norm": 0.20377538112996607, + "learning_rate": 1.5890072465086685e-05, + "loss": 0.3012, + "num_tokens": 1510510392.0, + "step": 1981 + }, + { + "epoch": 2.704639774679142, + "grad_norm": 0.19916106795507363, + "learning_rate": 1.5885898611437558e-05, + "loss": 0.2898, + "num_tokens": 1511252490.0, + "step": 1982 + }, + { + "epoch": 2.7060053556378225, + "grad_norm": 0.1736690189823564, + "learning_rate": 1.588172326731236e-05, + "loss": 0.3193, + "num_tokens": 1512143817.0, + "step": 1983 + }, + { + "epoch": 2.7073709365965026, + "grad_norm": 0.1962914970834948, + "learning_rate": 1.58775464339848e-05, + "loss": 0.3147, + "num_tokens": 1512873263.0, + "step": 1984 + }, + { + "epoch": 2.708736517555183, + "grad_norm": 0.19621335265369574, + "learning_rate": 1.5873368112729052e-05, + "loss": 0.3187, + "num_tokens": 1513651686.0, + "step": 1985 + }, + { + "epoch": 2.7101020985138637, + "grad_norm": 0.19113622152082774, + "learning_rate": 1.586918830481974e-05, + "loss": 0.2903, + "num_tokens": 1514385434.0, + "step": 1986 + }, + { + "epoch": 2.7114676794725443, + "grad_norm": 0.17762349135599528, + "learning_rate": 1.586500701153194e-05, + "loss": 0.3094, + "num_tokens": 1515164356.0, + "step": 1987 + }, + { + "epoch": 2.712833260431225, + "grad_norm": 0.1918050094962549, + "learning_rate": 1.586082423414119e-05, + "loss": 0.2999, + "num_tokens": 1515922639.0, + "step": 1988 + }, + { + "epoch": 2.7141988413899054, + "grad_norm": 0.1830446737747264, + "learning_rate": 1.585663997392347e-05, + "loss": 0.3105, + "num_tokens": 1516724373.0, + "step": 1989 + }, + { + "epoch": 2.715564422348586, + "grad_norm": 0.18818035744910167, + "learning_rate": 1.5852454232155216e-05, + "loss": 0.3078, + "num_tokens": 1517540615.0, + "step": 1990 + }, + { + "epoch": 2.7169300033072665, + "grad_norm": 0.16958844603642678, + "learning_rate": 1.5848267010113318e-05, + "loss": 0.3056, + "num_tokens": 1518259705.0, + "step": 1991 + }, + { + "epoch": 2.718295584265947, + "grad_norm": 0.1940516565257873, + "learning_rate": 1.584407830907512e-05, + "loss": 0.3082, + "num_tokens": 1519056670.0, + "step": 1992 + }, + { + "epoch": 2.719661165224627, + "grad_norm": 0.18102341432963825, + "learning_rate": 1.58398881303184e-05, + "loss": 0.3071, + "num_tokens": 1519831618.0, + "step": 1993 + }, + { + "epoch": 2.7210267461833078, + "grad_norm": 0.1902441731473542, + "learning_rate": 1.5835696475121418e-05, + "loss": 0.3139, + "num_tokens": 1520558945.0, + "step": 1994 + }, + { + "epoch": 2.7223923271419883, + "grad_norm": 0.18505147682469275, + "learning_rate": 1.5831503344762854e-05, + "loss": 0.3088, + "num_tokens": 1521319408.0, + "step": 1995 + }, + { + "epoch": 2.723757908100669, + "grad_norm": 0.17786564973003985, + "learning_rate": 1.5827308740521853e-05, + "loss": 0.3065, + "num_tokens": 1522131685.0, + "step": 1996 + }, + { + "epoch": 2.7251234890593494, + "grad_norm": 0.2111569063192298, + "learning_rate": 1.582311266367801e-05, + "loss": 0.2968, + "num_tokens": 1522796011.0, + "step": 1997 + }, + { + "epoch": 2.72648907001803, + "grad_norm": 0.30047040104391864, + "learning_rate": 1.5818915115511363e-05, + "loss": 0.3014, + "num_tokens": 1523514448.0, + "step": 1998 + }, + { + "epoch": 2.7278546509767105, + "grad_norm": 0.2085879115477984, + "learning_rate": 1.58147160973024e-05, + "loss": 0.3172, + "num_tokens": 1524244415.0, + "step": 1999 + }, + { + "epoch": 2.729220231935391, + "grad_norm": 0.17196797697629762, + "learning_rate": 1.5810515610332072e-05, + "loss": 0.3162, + "num_tokens": 1525073445.0, + "step": 2000 + }, + { + "epoch": 2.7305858128940717, + "grad_norm": 0.17956950253335185, + "learning_rate": 1.5806313655881755e-05, + "loss": 0.3038, + "num_tokens": 1525836411.0, + "step": 2001 + }, + { + "epoch": 2.7319513938527518, + "grad_norm": 0.18863467971230788, + "learning_rate": 1.5802110235233284e-05, + "loss": 0.3157, + "num_tokens": 1526555017.0, + "step": 2002 + }, + { + "epoch": 2.7333169748114323, + "grad_norm": 0.1924796762970696, + "learning_rate": 1.579790534966894e-05, + "loss": 0.3038, + "num_tokens": 1527290561.0, + "step": 2003 + }, + { + "epoch": 2.734682555770113, + "grad_norm": 0.17473338872740446, + "learning_rate": 1.5793699000471453e-05, + "loss": 0.3194, + "num_tokens": 1528144211.0, + "step": 2004 + }, + { + "epoch": 2.7360481367287934, + "grad_norm": 0.17859681502213878, + "learning_rate": 1.5789491188924e-05, + "loss": 0.3176, + "num_tokens": 1528940233.0, + "step": 2005 + }, + { + "epoch": 2.737413717687474, + "grad_norm": 0.1780843190279203, + "learning_rate": 1.57852819163102e-05, + "loss": 0.3132, + "num_tokens": 1529739089.0, + "step": 2006 + }, + { + "epoch": 2.7387792986461545, + "grad_norm": 0.17241840122098126, + "learning_rate": 1.5781071183914124e-05, + "loss": 0.3148, + "num_tokens": 1530535202.0, + "step": 2007 + }, + { + "epoch": 2.740144879604835, + "grad_norm": 0.16736850481255391, + "learning_rate": 1.5776858993020275e-05, + "loss": 0.3231, + "num_tokens": 1531366180.0, + "step": 2008 + }, + { + "epoch": 2.7415104605635157, + "grad_norm": 0.18064793122751724, + "learning_rate": 1.577264534491362e-05, + "loss": 0.2963, + "num_tokens": 1532130781.0, + "step": 2009 + }, + { + "epoch": 2.742876041522196, + "grad_norm": 0.17049947476077676, + "learning_rate": 1.5768430240879553e-05, + "loss": 0.2989, + "num_tokens": 1532831553.0, + "step": 2010 + }, + { + "epoch": 2.7442416224808763, + "grad_norm": 0.1804073326101257, + "learning_rate": 1.5764213682203927e-05, + "loss": 0.31, + "num_tokens": 1533554243.0, + "step": 2011 + }, + { + "epoch": 2.745607203439557, + "grad_norm": 0.1830577992918019, + "learning_rate": 1.575999567017302e-05, + "loss": 0.3049, + "num_tokens": 1534274206.0, + "step": 2012 + }, + { + "epoch": 2.7469727843982374, + "grad_norm": 0.18012470517482207, + "learning_rate": 1.5755776206073575e-05, + "loss": 0.2921, + "num_tokens": 1535003594.0, + "step": 2013 + }, + { + "epoch": 2.748338365356918, + "grad_norm": 0.16839631479627096, + "learning_rate": 1.5751555291192767e-05, + "loss": 0.2966, + "num_tokens": 1535742456.0, + "step": 2014 + }, + { + "epoch": 2.7497039463155986, + "grad_norm": 0.17717540871324788, + "learning_rate": 1.574733292681821e-05, + "loss": 0.294, + "num_tokens": 1536504692.0, + "step": 2015 + }, + { + "epoch": 2.751069527274279, + "grad_norm": 0.16826412657979858, + "learning_rate": 1.5743109114237968e-05, + "loss": 0.3048, + "num_tokens": 1537413578.0, + "step": 2016 + }, + { + "epoch": 2.7524351082329597, + "grad_norm": 0.16936450271806502, + "learning_rate": 1.573888385474054e-05, + "loss": 0.3175, + "num_tokens": 1538242925.0, + "step": 2017 + }, + { + "epoch": 2.7538006891916402, + "grad_norm": 0.17457369347799437, + "learning_rate": 1.573465714961487e-05, + "loss": 0.3184, + "num_tokens": 1539119585.0, + "step": 2018 + }, + { + "epoch": 2.755166270150321, + "grad_norm": 0.17287512750451553, + "learning_rate": 1.573042900015035e-05, + "loss": 0.309, + "num_tokens": 1539886400.0, + "step": 2019 + }, + { + "epoch": 2.756531851109001, + "grad_norm": 0.17748656146744635, + "learning_rate": 1.5726199407636794e-05, + "loss": 0.3206, + "num_tokens": 1540662660.0, + "step": 2020 + }, + { + "epoch": 2.7578974320676815, + "grad_norm": 0.17986026394182325, + "learning_rate": 1.5721968373364475e-05, + "loss": 0.3009, + "num_tokens": 1541425254.0, + "step": 2021 + }, + { + "epoch": 2.759263013026362, + "grad_norm": 0.20611931559420996, + "learning_rate": 1.5717735898624094e-05, + "loss": 0.3087, + "num_tokens": 1542138955.0, + "step": 2022 + }, + { + "epoch": 2.7606285939850426, + "grad_norm": 0.17915543676970772, + "learning_rate": 1.5713501984706803e-05, + "loss": 0.3176, + "num_tokens": 1542946291.0, + "step": 2023 + }, + { + "epoch": 2.761994174943723, + "grad_norm": 0.1849957986779023, + "learning_rate": 1.5709266632904176e-05, + "loss": 0.3147, + "num_tokens": 1543757717.0, + "step": 2024 + }, + { + "epoch": 2.7633597559024037, + "grad_norm": 0.17867712404877176, + "learning_rate": 1.5705029844508242e-05, + "loss": 0.2987, + "num_tokens": 1544601415.0, + "step": 2025 + }, + { + "epoch": 2.7647253368610842, + "grad_norm": 0.18016826128694488, + "learning_rate": 1.5700791620811463e-05, + "loss": 0.3046, + "num_tokens": 1545311476.0, + "step": 2026 + }, + { + "epoch": 2.766090917819765, + "grad_norm": 0.18206916032855192, + "learning_rate": 1.569655196310673e-05, + "loss": 0.3038, + "num_tokens": 1546033137.0, + "step": 2027 + }, + { + "epoch": 2.7674564987784454, + "grad_norm": 0.18779454141894894, + "learning_rate": 1.5692310872687386e-05, + "loss": 0.3078, + "num_tokens": 1546851226.0, + "step": 2028 + }, + { + "epoch": 2.7688220797371255, + "grad_norm": 0.1850291238511179, + "learning_rate": 1.5688068350847202e-05, + "loss": 0.2995, + "num_tokens": 1547692709.0, + "step": 2029 + }, + { + "epoch": 2.770187660695806, + "grad_norm": 0.17234622168266184, + "learning_rate": 1.5683824398880388e-05, + "loss": 0.319, + "num_tokens": 1548501157.0, + "step": 2030 + }, + { + "epoch": 2.7715532416544866, + "grad_norm": 0.18765325178759032, + "learning_rate": 1.5679579018081583e-05, + "loss": 0.3121, + "num_tokens": 1549293356.0, + "step": 2031 + }, + { + "epoch": 2.772918822613167, + "grad_norm": 0.2002498508813386, + "learning_rate": 1.5675332209745883e-05, + "loss": 0.3138, + "num_tokens": 1550104515.0, + "step": 2032 + }, + { + "epoch": 2.7742844035718477, + "grad_norm": 0.16714120275881097, + "learning_rate": 1.567108397516879e-05, + "loss": 0.3221, + "num_tokens": 1550962342.0, + "step": 2033 + }, + { + "epoch": 2.7756499845305282, + "grad_norm": 0.19469077286333075, + "learning_rate": 1.5666834315646266e-05, + "loss": 0.3138, + "num_tokens": 1551761563.0, + "step": 2034 + }, + { + "epoch": 2.777015565489209, + "grad_norm": 0.2066678404556175, + "learning_rate": 1.56625832324747e-05, + "loss": 0.2952, + "num_tokens": 1552442608.0, + "step": 2035 + }, + { + "epoch": 2.7783811464478894, + "grad_norm": 0.18144005215829043, + "learning_rate": 1.5658330726950903e-05, + "loss": 0.3099, + "num_tokens": 1553169029.0, + "step": 2036 + }, + { + "epoch": 2.77974672740657, + "grad_norm": 0.1817823115530332, + "learning_rate": 1.565407680037214e-05, + "loss": 0.2967, + "num_tokens": 1553935121.0, + "step": 2037 + }, + { + "epoch": 2.78111230836525, + "grad_norm": 0.16833731571563537, + "learning_rate": 1.5649821454036095e-05, + "loss": 0.3037, + "num_tokens": 1554728804.0, + "step": 2038 + }, + { + "epoch": 2.7824778893239306, + "grad_norm": 0.1898468306081889, + "learning_rate": 1.564556468924089e-05, + "loss": 0.3064, + "num_tokens": 1555505128.0, + "step": 2039 + }, + { + "epoch": 2.783843470282611, + "grad_norm": 0.18958211080636667, + "learning_rate": 1.5641306507285083e-05, + "loss": 0.2921, + "num_tokens": 1556268399.0, + "step": 2040 + }, + { + "epoch": 2.7852090512412917, + "grad_norm": 0.17221289746435242, + "learning_rate": 1.5637046909467656e-05, + "loss": 0.3078, + "num_tokens": 1557008983.0, + "step": 2041 + }, + { + "epoch": 2.7865746321999723, + "grad_norm": 0.18247352701471983, + "learning_rate": 1.5632785897088033e-05, + "loss": 0.3156, + "num_tokens": 1557768226.0, + "step": 2042 + }, + { + "epoch": 2.787940213158653, + "grad_norm": 0.1950310392843476, + "learning_rate": 1.562852347144606e-05, + "loss": 0.3083, + "num_tokens": 1558523087.0, + "step": 2043 + }, + { + "epoch": 2.7893057941173334, + "grad_norm": 0.19371996089378746, + "learning_rate": 1.5624259633842022e-05, + "loss": 0.3134, + "num_tokens": 1559323848.0, + "step": 2044 + }, + { + "epoch": 2.790671375076014, + "grad_norm": 0.18054486951423213, + "learning_rate": 1.5619994385576628e-05, + "loss": 0.3047, + "num_tokens": 1560120139.0, + "step": 2045 + }, + { + "epoch": 2.7920369560346945, + "grad_norm": 0.1924141706886034, + "learning_rate": 1.561572772795102e-05, + "loss": 0.3128, + "num_tokens": 1560893595.0, + "step": 2046 + }, + { + "epoch": 2.7934025369933746, + "grad_norm": 0.24394453996484644, + "learning_rate": 1.5611459662266776e-05, + "loss": 0.3161, + "num_tokens": 1561691170.0, + "step": 2047 + }, + { + "epoch": 2.794768117952055, + "grad_norm": 0.18964739816910425, + "learning_rate": 1.5607190189825892e-05, + "loss": 0.3102, + "num_tokens": 1562456890.0, + "step": 2048 + }, + { + "epoch": 2.7961336989107357, + "grad_norm": 0.19406170209338747, + "learning_rate": 1.56029193119308e-05, + "loss": 0.313, + "num_tokens": 1563192703.0, + "step": 2049 + }, + { + "epoch": 2.7974992798694163, + "grad_norm": 0.19325592834061386, + "learning_rate": 1.5598647029884365e-05, + "loss": 0.315, + "num_tokens": 1563969623.0, + "step": 2050 + }, + { + "epoch": 2.798864860828097, + "grad_norm": 0.19646622920678042, + "learning_rate": 1.559437334498987e-05, + "loss": 0.3121, + "num_tokens": 1564667276.0, + "step": 2051 + }, + { + "epoch": 2.8002304417867774, + "grad_norm": 0.18917839608836245, + "learning_rate": 1.5590098258551033e-05, + "loss": 0.3097, + "num_tokens": 1565406796.0, + "step": 2052 + }, + { + "epoch": 2.801596022745458, + "grad_norm": 0.2026016081274513, + "learning_rate": 1.5585821771871997e-05, + "loss": 0.3293, + "num_tokens": 1566213673.0, + "step": 2053 + }, + { + "epoch": 2.8029616037041385, + "grad_norm": 0.18783210680270168, + "learning_rate": 1.5581543886257333e-05, + "loss": 0.3283, + "num_tokens": 1566965418.0, + "step": 2054 + }, + { + "epoch": 2.804327184662819, + "grad_norm": 0.18936274551563168, + "learning_rate": 1.5577264603012038e-05, + "loss": 0.303, + "num_tokens": 1567757793.0, + "step": 2055 + }, + { + "epoch": 2.805692765621499, + "grad_norm": 0.18546471277022178, + "learning_rate": 1.557298392344154e-05, + "loss": 0.3027, + "num_tokens": 1568524984.0, + "step": 2056 + }, + { + "epoch": 2.8070583465801797, + "grad_norm": 0.16717594657101006, + "learning_rate": 1.5568701848851688e-05, + "loss": 0.2963, + "num_tokens": 1569374994.0, + "step": 2057 + }, + { + "epoch": 2.8084239275388603, + "grad_norm": 0.18397270230676047, + "learning_rate": 1.5564418380548754e-05, + "loss": 0.3065, + "num_tokens": 1570128408.0, + "step": 2058 + }, + { + "epoch": 2.809789508497541, + "grad_norm": 0.18190322735558026, + "learning_rate": 1.5560133519839438e-05, + "loss": 0.3133, + "num_tokens": 1570914021.0, + "step": 2059 + }, + { + "epoch": 2.8111550894562214, + "grad_norm": 0.1756713193899078, + "learning_rate": 1.5555847268030867e-05, + "loss": 0.3031, + "num_tokens": 1571679817.0, + "step": 2060 + }, + { + "epoch": 2.812520670414902, + "grad_norm": 0.1786951073297999, + "learning_rate": 1.5551559626430595e-05, + "loss": 0.3006, + "num_tokens": 1572445465.0, + "step": 2061 + }, + { + "epoch": 2.8138862513735825, + "grad_norm": 0.18366943597661836, + "learning_rate": 1.5547270596346588e-05, + "loss": 0.3094, + "num_tokens": 1573239961.0, + "step": 2062 + }, + { + "epoch": 2.815251832332263, + "grad_norm": 0.1655356872478009, + "learning_rate": 1.5542980179087253e-05, + "loss": 0.3125, + "num_tokens": 1574079768.0, + "step": 2063 + }, + { + "epoch": 2.8166174132909436, + "grad_norm": 0.19804365650071404, + "learning_rate": 1.5538688375961403e-05, + "loss": 0.3048, + "num_tokens": 1574845718.0, + "step": 2064 + }, + { + "epoch": 2.8179829942496237, + "grad_norm": 0.17618962572489064, + "learning_rate": 1.553439518827828e-05, + "loss": 0.3177, + "num_tokens": 1575600068.0, + "step": 2065 + }, + { + "epoch": 2.8193485752083043, + "grad_norm": 0.18919250065100415, + "learning_rate": 1.5530100617347555e-05, + "loss": 0.3029, + "num_tokens": 1576348638.0, + "step": 2066 + }, + { + "epoch": 2.820714156166985, + "grad_norm": 0.1874101527724353, + "learning_rate": 1.552580466447932e-05, + "loss": 0.3087, + "num_tokens": 1577105931.0, + "step": 2067 + }, + { + "epoch": 2.8220797371256654, + "grad_norm": 0.19431234424305938, + "learning_rate": 1.5521507330984066e-05, + "loss": 0.3037, + "num_tokens": 1577823053.0, + "step": 2068 + }, + { + "epoch": 2.823445318084346, + "grad_norm": 0.17819570589730788, + "learning_rate": 1.5517208618172742e-05, + "loss": 0.3205, + "num_tokens": 1578623716.0, + "step": 2069 + }, + { + "epoch": 2.8248108990430265, + "grad_norm": 0.19518746684846902, + "learning_rate": 1.5512908527356693e-05, + "loss": 0.3195, + "num_tokens": 1579394406.0, + "step": 2070 + }, + { + "epoch": 2.826176480001707, + "grad_norm": 0.18595276032186542, + "learning_rate": 1.550860705984769e-05, + "loss": 0.3057, + "num_tokens": 1580055876.0, + "step": 2071 + }, + { + "epoch": 2.8275420609603876, + "grad_norm": 0.2161784532935942, + "learning_rate": 1.5504304216957922e-05, + "loss": 0.3069, + "num_tokens": 1580755449.0, + "step": 2072 + }, + { + "epoch": 2.828907641919068, + "grad_norm": 0.1824185715570481, + "learning_rate": 1.55e-05, + "loss": 0.3005, + "num_tokens": 1581506354.0, + "step": 2073 + }, + { + "epoch": 2.8302732228777483, + "grad_norm": 0.18893054996977351, + "learning_rate": 1.5495694410286962e-05, + "loss": 0.3019, + "num_tokens": 1582268051.0, + "step": 2074 + }, + { + "epoch": 2.831638803836429, + "grad_norm": 0.18217246747003787, + "learning_rate": 1.5491387449132247e-05, + "loss": 0.2994, + "num_tokens": 1583024561.0, + "step": 2075 + }, + { + "epoch": 2.8330043847951094, + "grad_norm": 0.18440679043069302, + "learning_rate": 1.548707911784973e-05, + "loss": 0.2872, + "num_tokens": 1583785679.0, + "step": 2076 + }, + { + "epoch": 2.83436996575379, + "grad_norm": 0.17573684818584168, + "learning_rate": 1.548276941775369e-05, + "loss": 0.3179, + "num_tokens": 1584475065.0, + "step": 2077 + }, + { + "epoch": 2.8357355467124705, + "grad_norm": 0.19096204224773403, + "learning_rate": 1.5478458350158832e-05, + "loss": 0.3036, + "num_tokens": 1585276381.0, + "step": 2078 + }, + { + "epoch": 2.837101127671151, + "grad_norm": 0.1749934542593658, + "learning_rate": 1.547414591638028e-05, + "loss": 0.3054, + "num_tokens": 1586060990.0, + "step": 2079 + }, + { + "epoch": 2.8384667086298316, + "grad_norm": 0.16720660377609387, + "learning_rate": 1.5469832117733568e-05, + "loss": 0.2907, + "num_tokens": 1586766488.0, + "step": 2080 + }, + { + "epoch": 2.839832289588512, + "grad_norm": 0.21342029092249312, + "learning_rate": 1.5465516955534646e-05, + "loss": 0.3075, + "num_tokens": 1587427568.0, + "step": 2081 + }, + { + "epoch": 2.8411978705471927, + "grad_norm": 0.18596297208968338, + "learning_rate": 1.546120043109989e-05, + "loss": 0.3087, + "num_tokens": 1588132602.0, + "step": 2082 + }, + { + "epoch": 2.842563451505873, + "grad_norm": 0.19182021182483255, + "learning_rate": 1.5456882545746078e-05, + "loss": 0.3118, + "num_tokens": 1588926680.0, + "step": 2083 + }, + { + "epoch": 2.8439290324645534, + "grad_norm": 0.1870324665270737, + "learning_rate": 1.545256330079041e-05, + "loss": 0.3197, + "num_tokens": 1589785120.0, + "step": 2084 + }, + { + "epoch": 2.845294613423234, + "grad_norm": 0.1848265526921873, + "learning_rate": 1.5448242697550508e-05, + "loss": 0.3045, + "num_tokens": 1590536924.0, + "step": 2085 + }, + { + "epoch": 2.8466601943819145, + "grad_norm": 0.18066425398432745, + "learning_rate": 1.544392073734439e-05, + "loss": 0.3091, + "num_tokens": 1591267859.0, + "step": 2086 + }, + { + "epoch": 2.848025775340595, + "grad_norm": 0.20429050030958537, + "learning_rate": 1.5439597421490512e-05, + "loss": 0.3121, + "num_tokens": 1591984438.0, + "step": 2087 + }, + { + "epoch": 2.8493913562992756, + "grad_norm": 0.18085291524163258, + "learning_rate": 1.543527275130772e-05, + "loss": 0.3005, + "num_tokens": 1592693094.0, + "step": 2088 + }, + { + "epoch": 2.850756937257956, + "grad_norm": 0.16726142382856288, + "learning_rate": 1.543094672811529e-05, + "loss": 0.3053, + "num_tokens": 1593488758.0, + "step": 2089 + }, + { + "epoch": 2.8521225182166368, + "grad_norm": 0.18203249630644283, + "learning_rate": 1.5426619353232896e-05, + "loss": 0.3047, + "num_tokens": 1594261078.0, + "step": 2090 + }, + { + "epoch": 2.8534880991753173, + "grad_norm": 0.1892804476039286, + "learning_rate": 1.5422290627980638e-05, + "loss": 0.314, + "num_tokens": 1594984957.0, + "step": 2091 + }, + { + "epoch": 2.8548536801339974, + "grad_norm": 0.18089035190536298, + "learning_rate": 1.5417960553679025e-05, + "loss": 0.3031, + "num_tokens": 1595755119.0, + "step": 2092 + }, + { + "epoch": 2.856219261092678, + "grad_norm": 0.1866565524238595, + "learning_rate": 1.541362913164897e-05, + "loss": 0.3215, + "num_tokens": 1596549344.0, + "step": 2093 + }, + { + "epoch": 2.8575848420513585, + "grad_norm": 0.20351930556756695, + "learning_rate": 1.5409296363211808e-05, + "loss": 0.3078, + "num_tokens": 1597383512.0, + "step": 2094 + }, + { + "epoch": 2.858950423010039, + "grad_norm": 0.18762002041813716, + "learning_rate": 1.5404962249689275e-05, + "loss": 0.2984, + "num_tokens": 1598205610.0, + "step": 2095 + }, + { + "epoch": 2.8603160039687197, + "grad_norm": 0.16919060871526442, + "learning_rate": 1.5400626792403524e-05, + "loss": 0.3099, + "num_tokens": 1599016746.0, + "step": 2096 + }, + { + "epoch": 2.8616815849274, + "grad_norm": 0.16466552776452642, + "learning_rate": 1.5396289992677115e-05, + "loss": 0.3232, + "num_tokens": 1599903289.0, + "step": 2097 + }, + { + "epoch": 2.8630471658860808, + "grad_norm": 0.19560373521242896, + "learning_rate": 1.539195185183301e-05, + "loss": 0.3076, + "num_tokens": 1600666887.0, + "step": 2098 + }, + { + "epoch": 2.8644127468447613, + "grad_norm": 0.17274749781715565, + "learning_rate": 1.53876123711946e-05, + "loss": 0.3148, + "num_tokens": 1601490092.0, + "step": 2099 + }, + { + "epoch": 2.865778327803442, + "grad_norm": 0.18172910192156871, + "learning_rate": 1.5383271552085663e-05, + "loss": 0.3095, + "num_tokens": 1602203796.0, + "step": 2100 + }, + { + "epoch": 2.867143908762122, + "grad_norm": 0.17310525957497444, + "learning_rate": 1.53789293958304e-05, + "loss": 0.3144, + "num_tokens": 1603040348.0, + "step": 2101 + }, + { + "epoch": 2.8685094897208026, + "grad_norm": 0.17437523702201463, + "learning_rate": 1.5374585903753413e-05, + "loss": 0.3136, + "num_tokens": 1603834290.0, + "step": 2102 + }, + { + "epoch": 2.869875070679483, + "grad_norm": 0.17798869939092224, + "learning_rate": 1.5370241077179717e-05, + "loss": 0.3105, + "num_tokens": 1604588774.0, + "step": 2103 + }, + { + "epoch": 2.8712406516381637, + "grad_norm": 0.18170458080760385, + "learning_rate": 1.5365894917434726e-05, + "loss": 0.3238, + "num_tokens": 1605430958.0, + "step": 2104 + }, + { + "epoch": 2.8726062325968442, + "grad_norm": 0.20264812324393244, + "learning_rate": 1.5361547425844266e-05, + "loss": 0.3083, + "num_tokens": 1606190754.0, + "step": 2105 + }, + { + "epoch": 2.873971813555525, + "grad_norm": 0.1902867326013823, + "learning_rate": 1.5357198603734565e-05, + "loss": 0.3013, + "num_tokens": 1606919157.0, + "step": 2106 + }, + { + "epoch": 2.8753373945142053, + "grad_norm": 0.18229370340010695, + "learning_rate": 1.535284845243227e-05, + "loss": 0.3138, + "num_tokens": 1607734009.0, + "step": 2107 + }, + { + "epoch": 2.876702975472886, + "grad_norm": 0.18395722609050766, + "learning_rate": 1.5348496973264414e-05, + "loss": 0.3067, + "num_tokens": 1608513231.0, + "step": 2108 + }, + { + "epoch": 2.8780685564315664, + "grad_norm": 0.17507617546912901, + "learning_rate": 1.5344144167558453e-05, + "loss": 0.3172, + "num_tokens": 1609290562.0, + "step": 2109 + }, + { + "epoch": 2.8794341373902466, + "grad_norm": 0.20659375898966983, + "learning_rate": 1.5339790036642233e-05, + "loss": 0.3116, + "num_tokens": 1610051268.0, + "step": 2110 + }, + { + "epoch": 2.880799718348927, + "grad_norm": 0.1975041848410723, + "learning_rate": 1.5335434581844016e-05, + "loss": 0.3073, + "num_tokens": 1610792903.0, + "step": 2111 + }, + { + "epoch": 2.8821652993076077, + "grad_norm": 0.1703320019567883, + "learning_rate": 1.5331077804492456e-05, + "loss": 0.3054, + "num_tokens": 1611558246.0, + "step": 2112 + }, + { + "epoch": 2.8835308802662882, + "grad_norm": 0.2002152589199548, + "learning_rate": 1.532671970591662e-05, + "loss": 0.3027, + "num_tokens": 1612286445.0, + "step": 2113 + }, + { + "epoch": 2.884896461224969, + "grad_norm": 0.20061619269187725, + "learning_rate": 1.532236028744598e-05, + "loss": 0.304, + "num_tokens": 1613051462.0, + "step": 2114 + }, + { + "epoch": 2.8862620421836493, + "grad_norm": 0.1941099060287833, + "learning_rate": 1.53179995504104e-05, + "loss": 0.3096, + "num_tokens": 1613820234.0, + "step": 2115 + }, + { + "epoch": 2.88762762314233, + "grad_norm": 0.18705057134418998, + "learning_rate": 1.5313637496140153e-05, + "loss": 0.3029, + "num_tokens": 1614584273.0, + "step": 2116 + }, + { + "epoch": 2.8889932041010105, + "grad_norm": 0.2368040723156937, + "learning_rate": 1.5309274125965913e-05, + "loss": 0.3064, + "num_tokens": 1615355565.0, + "step": 2117 + }, + { + "epoch": 2.890358785059691, + "grad_norm": 0.1758984417740813, + "learning_rate": 1.530490944121876e-05, + "loss": 0.3052, + "num_tokens": 1616110340.0, + "step": 2118 + }, + { + "epoch": 2.891724366018371, + "grad_norm": 0.18290762972473337, + "learning_rate": 1.5300543443230164e-05, + "loss": 0.2994, + "num_tokens": 1616832491.0, + "step": 2119 + }, + { + "epoch": 2.8930899469770517, + "grad_norm": 0.20703595229796132, + "learning_rate": 1.5296176133332004e-05, + "loss": 0.3082, + "num_tokens": 1617561953.0, + "step": 2120 + }, + { + "epoch": 2.8944555279357322, + "grad_norm": 0.20253356335629422, + "learning_rate": 1.529180751285656e-05, + "loss": 0.3141, + "num_tokens": 1618327028.0, + "step": 2121 + }, + { + "epoch": 2.895821108894413, + "grad_norm": 0.1845378774891888, + "learning_rate": 1.52874375831365e-05, + "loss": 0.3131, + "num_tokens": 1619058169.0, + "step": 2122 + }, + { + "epoch": 2.8971866898530934, + "grad_norm": 0.2175473032452687, + "learning_rate": 1.528306634550491e-05, + "loss": 0.326, + "num_tokens": 1619756907.0, + "step": 2123 + }, + { + "epoch": 2.898552270811774, + "grad_norm": 0.19070636539816416, + "learning_rate": 1.5278693801295263e-05, + "loss": 0.3099, + "num_tokens": 1620564265.0, + "step": 2124 + }, + { + "epoch": 2.8999178517704545, + "grad_norm": 0.19225487616592124, + "learning_rate": 1.527431995184143e-05, + "loss": 0.3114, + "num_tokens": 1621323055.0, + "step": 2125 + }, + { + "epoch": 2.901283432729135, + "grad_norm": 0.19770345864034772, + "learning_rate": 1.5269944798477683e-05, + "loss": 0.3189, + "num_tokens": 1622043973.0, + "step": 2126 + }, + { + "epoch": 2.9026490136878156, + "grad_norm": 0.18285251218820472, + "learning_rate": 1.5265568342538698e-05, + "loss": 0.3045, + "num_tokens": 1622805818.0, + "step": 2127 + }, + { + "epoch": 2.9040145946464957, + "grad_norm": 0.17723271278829994, + "learning_rate": 1.5261190585359535e-05, + "loss": 0.3041, + "num_tokens": 1623593197.0, + "step": 2128 + }, + { + "epoch": 2.9053801756051763, + "grad_norm": 0.1877148446131528, + "learning_rate": 1.5256811528275661e-05, + "loss": 0.326, + "num_tokens": 1624397773.0, + "step": 2129 + }, + { + "epoch": 2.906745756563857, + "grad_norm": 0.18004737962838546, + "learning_rate": 1.5252431172622938e-05, + "loss": 0.3013, + "num_tokens": 1625200763.0, + "step": 2130 + }, + { + "epoch": 2.9081113375225374, + "grad_norm": 0.19137899856987836, + "learning_rate": 1.5248049519737618e-05, + "loss": 0.3197, + "num_tokens": 1625924357.0, + "step": 2131 + }, + { + "epoch": 2.909476918481218, + "grad_norm": 0.20375145438236403, + "learning_rate": 1.524366657095636e-05, + "loss": 0.3061, + "num_tokens": 1626661744.0, + "step": 2132 + }, + { + "epoch": 2.9108424994398985, + "grad_norm": 0.18835986689714515, + "learning_rate": 1.5239282327616207e-05, + "loss": 0.3002, + "num_tokens": 1627388872.0, + "step": 2133 + }, + { + "epoch": 2.912208080398579, + "grad_norm": 0.20009699897484454, + "learning_rate": 1.5234896791054603e-05, + "loss": 0.302, + "num_tokens": 1628169555.0, + "step": 2134 + }, + { + "epoch": 2.9135736613572596, + "grad_norm": 0.1882383207583699, + "learning_rate": 1.523050996260939e-05, + "loss": 0.3113, + "num_tokens": 1628909147.0, + "step": 2135 + }, + { + "epoch": 2.91493924231594, + "grad_norm": 0.20665657439419013, + "learning_rate": 1.522612184361879e-05, + "loss": 0.3195, + "num_tokens": 1629736045.0, + "step": 2136 + }, + { + "epoch": 2.9163048232746203, + "grad_norm": 0.22556934474825374, + "learning_rate": 1.5221732435421436e-05, + "loss": 0.3083, + "num_tokens": 1630540779.0, + "step": 2137 + }, + { + "epoch": 2.917670404233301, + "grad_norm": 0.1984581438575971, + "learning_rate": 1.5217341739356343e-05, + "loss": 0.3155, + "num_tokens": 1631334054.0, + "step": 2138 + }, + { + "epoch": 2.9190359851919814, + "grad_norm": 0.2129716694813734, + "learning_rate": 1.5212949756762925e-05, + "loss": 0.3183, + "num_tokens": 1632106899.0, + "step": 2139 + }, + { + "epoch": 2.920401566150662, + "grad_norm": 0.19883198534267346, + "learning_rate": 1.5208556488980984e-05, + "loss": 0.2858, + "num_tokens": 1632859433.0, + "step": 2140 + }, + { + "epoch": 2.9217671471093425, + "grad_norm": 0.21648750861286428, + "learning_rate": 1.5204161937350713e-05, + "loss": 0.3188, + "num_tokens": 1633574311.0, + "step": 2141 + }, + { + "epoch": 2.923132728068023, + "grad_norm": 0.17798198891781183, + "learning_rate": 1.5199766103212701e-05, + "loss": 0.2942, + "num_tokens": 1634431326.0, + "step": 2142 + }, + { + "epoch": 2.9244983090267036, + "grad_norm": 0.2003664857466294, + "learning_rate": 1.5195368987907931e-05, + "loss": 0.3023, + "num_tokens": 1635163965.0, + "step": 2143 + }, + { + "epoch": 2.925863889985384, + "grad_norm": 0.19817568897495416, + "learning_rate": 1.5190970592777763e-05, + "loss": 0.3013, + "num_tokens": 1635900716.0, + "step": 2144 + }, + { + "epoch": 2.9272294709440647, + "grad_norm": 0.1984341540971505, + "learning_rate": 1.5186570919163965e-05, + "loss": 0.3057, + "num_tokens": 1636621730.0, + "step": 2145 + }, + { + "epoch": 2.928595051902745, + "grad_norm": 0.20146093928000117, + "learning_rate": 1.5182169968408686e-05, + "loss": 0.3133, + "num_tokens": 1637412984.0, + "step": 2146 + }, + { + "epoch": 2.9299606328614254, + "grad_norm": 0.17766358689088885, + "learning_rate": 1.5177767741854461e-05, + "loss": 0.2939, + "num_tokens": 1638112779.0, + "step": 2147 + }, + { + "epoch": 2.931326213820106, + "grad_norm": 0.19692550994476404, + "learning_rate": 1.5173364240844226e-05, + "loss": 0.3243, + "num_tokens": 1638911250.0, + "step": 2148 + }, + { + "epoch": 2.9326917947787865, + "grad_norm": 0.20655734511780335, + "learning_rate": 1.5168959466721291e-05, + "loss": 0.3037, + "num_tokens": 1639667027.0, + "step": 2149 + }, + { + "epoch": 2.934057375737467, + "grad_norm": 0.2292820618951723, + "learning_rate": 1.5164553420829367e-05, + "loss": 0.318, + "num_tokens": 1640437047.0, + "step": 2150 + }, + { + "epoch": 2.9354229566961476, + "grad_norm": 0.19969420773824326, + "learning_rate": 1.5160146104512546e-05, + "loss": 0.3073, + "num_tokens": 1641213566.0, + "step": 2151 + }, + { + "epoch": 2.936788537654828, + "grad_norm": 0.19694906216974098, + "learning_rate": 1.5155737519115308e-05, + "loss": 0.3038, + "num_tokens": 1641946172.0, + "step": 2152 + }, + { + "epoch": 2.9381541186135087, + "grad_norm": 0.20249739089745913, + "learning_rate": 1.5151327665982522e-05, + "loss": 0.2993, + "num_tokens": 1642609631.0, + "step": 2153 + }, + { + "epoch": 2.9395196995721893, + "grad_norm": 0.1957259808699525, + "learning_rate": 1.5146916546459443e-05, + "loss": 0.303, + "num_tokens": 1643321452.0, + "step": 2154 + }, + { + "epoch": 2.9408852805308694, + "grad_norm": 0.19675826511115072, + "learning_rate": 1.5142504161891715e-05, + "loss": 0.3093, + "num_tokens": 1644123007.0, + "step": 2155 + }, + { + "epoch": 2.94225086148955, + "grad_norm": 0.17509380023595492, + "learning_rate": 1.5138090513625362e-05, + "loss": 0.3045, + "num_tokens": 1644928081.0, + "step": 2156 + }, + { + "epoch": 2.9436164424482305, + "grad_norm": 0.19334999669989342, + "learning_rate": 1.5133675603006801e-05, + "loss": 0.2984, + "num_tokens": 1645636129.0, + "step": 2157 + }, + { + "epoch": 2.944982023406911, + "grad_norm": 0.19482735678217422, + "learning_rate": 1.512925943138283e-05, + "loss": 0.3059, + "num_tokens": 1646411454.0, + "step": 2158 + }, + { + "epoch": 2.9463476043655916, + "grad_norm": 0.17760842062601973, + "learning_rate": 1.5124842000100625e-05, + "loss": 0.3109, + "num_tokens": 1647244853.0, + "step": 2159 + }, + { + "epoch": 2.947713185324272, + "grad_norm": 0.20471910085021677, + "learning_rate": 1.5120423310507762e-05, + "loss": 0.3112, + "num_tokens": 1648000197.0, + "step": 2160 + }, + { + "epoch": 2.9490787662829527, + "grad_norm": 0.19125507683006296, + "learning_rate": 1.5116003363952184e-05, + "loss": 0.3155, + "num_tokens": 1648780753.0, + "step": 2161 + }, + { + "epoch": 2.9504443472416333, + "grad_norm": 0.17952067623025666, + "learning_rate": 1.5111582161782228e-05, + "loss": 0.3059, + "num_tokens": 1649501775.0, + "step": 2162 + }, + { + "epoch": 2.951809928200314, + "grad_norm": 0.18988718754676528, + "learning_rate": 1.5107159705346612e-05, + "loss": 0.3079, + "num_tokens": 1650238419.0, + "step": 2163 + }, + { + "epoch": 2.953175509158994, + "grad_norm": 0.17932709142277242, + "learning_rate": 1.5102735995994438e-05, + "loss": 0.3063, + "num_tokens": 1650995298.0, + "step": 2164 + }, + { + "epoch": 2.9545410901176745, + "grad_norm": 0.1995439969083902, + "learning_rate": 1.5098311035075185e-05, + "loss": 0.3166, + "num_tokens": 1651729942.0, + "step": 2165 + }, + { + "epoch": 2.955906671076355, + "grad_norm": 0.17500159920829444, + "learning_rate": 1.5093884823938715e-05, + "loss": 0.3068, + "num_tokens": 1652519149.0, + "step": 2166 + }, + { + "epoch": 2.9572722520350356, + "grad_norm": 0.1972690426327307, + "learning_rate": 1.508945736393528e-05, + "loss": 0.3198, + "num_tokens": 1653227951.0, + "step": 2167 + }, + { + "epoch": 2.958637832993716, + "grad_norm": 0.18162593161012908, + "learning_rate": 1.5085028656415503e-05, + "loss": 0.3094, + "num_tokens": 1654014993.0, + "step": 2168 + }, + { + "epoch": 2.9600034139523967, + "grad_norm": 0.18362743145140545, + "learning_rate": 1.5080598702730387e-05, + "loss": 0.2924, + "num_tokens": 1654756241.0, + "step": 2169 + }, + { + "epoch": 2.9613689949110773, + "grad_norm": 0.17654844109249448, + "learning_rate": 1.5076167504231327e-05, + "loss": 0.2934, + "num_tokens": 1655450021.0, + "step": 2170 + }, + { + "epoch": 2.962734575869758, + "grad_norm": 0.1771805601557758, + "learning_rate": 1.5071735062270084e-05, + "loss": 0.3142, + "num_tokens": 1656181156.0, + "step": 2171 + }, + { + "epoch": 2.9641001568284384, + "grad_norm": 0.19159106797596323, + "learning_rate": 1.5067301378198801e-05, + "loss": 0.3037, + "num_tokens": 1656924363.0, + "step": 2172 + }, + { + "epoch": 2.9654657377871185, + "grad_norm": 0.19595295247745959, + "learning_rate": 1.5062866453370012e-05, + "loss": 0.3131, + "num_tokens": 1657669419.0, + "step": 2173 + }, + { + "epoch": 2.966831318745799, + "grad_norm": 0.19002962163436748, + "learning_rate": 1.5058430289136616e-05, + "loss": 0.2921, + "num_tokens": 1658374370.0, + "step": 2174 + }, + { + "epoch": 2.9681968997044796, + "grad_norm": 0.17831954932973135, + "learning_rate": 1.5053992886851892e-05, + "loss": 0.322, + "num_tokens": 1659198922.0, + "step": 2175 + }, + { + "epoch": 2.96956248066316, + "grad_norm": 0.18637839599764777, + "learning_rate": 1.5049554247869503e-05, + "loss": 0.3041, + "num_tokens": 1659993027.0, + "step": 2176 + }, + { + "epoch": 2.9709280616218408, + "grad_norm": 0.17905577515401397, + "learning_rate": 1.5045114373543484e-05, + "loss": 0.3021, + "num_tokens": 1660723056.0, + "step": 2177 + }, + { + "epoch": 2.9722936425805213, + "grad_norm": 0.18161086183452435, + "learning_rate": 1.5040673265228248e-05, + "loss": 0.3166, + "num_tokens": 1661458478.0, + "step": 2178 + }, + { + "epoch": 2.973659223539202, + "grad_norm": 0.1729246987548811, + "learning_rate": 1.5036230924278585e-05, + "loss": 0.3102, + "num_tokens": 1662256130.0, + "step": 2179 + }, + { + "epoch": 2.9750248044978824, + "grad_norm": 0.17081408096723785, + "learning_rate": 1.5031787352049665e-05, + "loss": 0.2925, + "num_tokens": 1663002835.0, + "step": 2180 + }, + { + "epoch": 2.976390385456563, + "grad_norm": 0.1800011919152803, + "learning_rate": 1.502734254989702e-05, + "loss": 0.3088, + "num_tokens": 1663769802.0, + "step": 2181 + }, + { + "epoch": 2.977755966415243, + "grad_norm": 0.1667537178179067, + "learning_rate": 1.5022896519176577e-05, + "loss": 0.3161, + "num_tokens": 1664645047.0, + "step": 2182 + }, + { + "epoch": 2.9791215473739237, + "grad_norm": 0.17073663296944946, + "learning_rate": 1.5018449261244623e-05, + "loss": 0.3021, + "num_tokens": 1665403249.0, + "step": 2183 + }, + { + "epoch": 2.980487128332604, + "grad_norm": 0.1819218911819258, + "learning_rate": 1.501400077745782e-05, + "loss": 0.3111, + "num_tokens": 1666182797.0, + "step": 2184 + }, + { + "epoch": 2.9818527092912848, + "grad_norm": 0.18629202355330532, + "learning_rate": 1.5009551069173214e-05, + "loss": 0.3079, + "num_tokens": 1666911464.0, + "step": 2185 + }, + { + "epoch": 2.9832182902499653, + "grad_norm": 0.176975324111544, + "learning_rate": 1.5005100137748214e-05, + "loss": 0.3081, + "num_tokens": 1667753519.0, + "step": 2186 + }, + { + "epoch": 2.984583871208646, + "grad_norm": 0.1682447333768825, + "learning_rate": 1.500064798454061e-05, + "loss": 0.3144, + "num_tokens": 1668592779.0, + "step": 2187 + }, + { + "epoch": 2.9859494521673264, + "grad_norm": 0.18156437103065917, + "learning_rate": 1.4996194610908557e-05, + "loss": 0.3078, + "num_tokens": 1669353988.0, + "step": 2188 + }, + { + "epoch": 2.987315033126007, + "grad_norm": 0.17340718832087504, + "learning_rate": 1.4991740018210594e-05, + "loss": 0.2991, + "num_tokens": 1670090523.0, + "step": 2189 + }, + { + "epoch": 2.9886806140846875, + "grad_norm": 0.17694651077371376, + "learning_rate": 1.4987284207805614e-05, + "loss": 0.312, + "num_tokens": 1670884394.0, + "step": 2190 + }, + { + "epoch": 2.9900461950433677, + "grad_norm": 0.1883532757941866, + "learning_rate": 1.49828271810529e-05, + "loss": 0.3183, + "num_tokens": 1671655906.0, + "step": 2191 + }, + { + "epoch": 2.991411776002048, + "grad_norm": 0.17640232933837383, + "learning_rate": 1.497836893931209e-05, + "loss": 0.3135, + "num_tokens": 1672458813.0, + "step": 2192 + }, + { + "epoch": 2.9927773569607288, + "grad_norm": 0.17405100580136232, + "learning_rate": 1.4973909483943208e-05, + "loss": 0.3, + "num_tokens": 1673210726.0, + "step": 2193 + }, + { + "epoch": 2.9941429379194093, + "grad_norm": 0.1912232751370459, + "learning_rate": 1.4969448816306638e-05, + "loss": 0.3095, + "num_tokens": 1673992648.0, + "step": 2194 + }, + { + "epoch": 2.99550851887809, + "grad_norm": 0.17870899580522215, + "learning_rate": 1.4964986937763135e-05, + "loss": 0.3005, + "num_tokens": 1674753102.0, + "step": 2195 + }, + { + "epoch": 2.9968740998367704, + "grad_norm": 0.17104428158078847, + "learning_rate": 1.4960523849673826e-05, + "loss": 0.3015, + "num_tokens": 1675506607.0, + "step": 2196 + }, + { + "epoch": 2.998239680795451, + "grad_norm": 0.18408415612995824, + "learning_rate": 1.4956059553400211e-05, + "loss": 0.3096, + "num_tokens": 1676226838.0, + "step": 2197 + }, + { + "epoch": 2.9996052617541316, + "grad_norm": 0.18334579443981053, + "learning_rate": 1.4951594050304146e-05, + "loss": 0.3038, + "num_tokens": 1676898951.0, + "step": 2198 + }, + { + "epoch": 3.0, + "grad_norm": 0.18334579443981053, + "learning_rate": 1.4947127341747862e-05, + "loss": 0.2979, + "num_tokens": 1677126408.0, + "step": 2199 + }, + { + "epoch": 3.0013655809586806, + "grad_norm": 0.3444156282804254, + "learning_rate": 1.4942659429093962e-05, + "loss": 0.2717, + "num_tokens": 1677916942.0, + "step": 2200 + }, + { + "epoch": 3.002731161917361, + "grad_norm": 0.29108608662801516, + "learning_rate": 1.4938190313705416e-05, + "loss": 0.2619, + "num_tokens": 1678717166.0, + "step": 2201 + }, + { + "epoch": 3.0040967428760417, + "grad_norm": 0.2562279535482217, + "learning_rate": 1.4933719996945552e-05, + "loss": 0.2665, + "num_tokens": 1679458334.0, + "step": 2202 + }, + { + "epoch": 3.0054623238347222, + "grad_norm": 0.2041129811610529, + "learning_rate": 1.492924848017807e-05, + "loss": 0.2718, + "num_tokens": 1680236898.0, + "step": 2203 + }, + { + "epoch": 3.0068279047934023, + "grad_norm": 0.32243815358448286, + "learning_rate": 1.492477576476704e-05, + "loss": 0.285, + "num_tokens": 1681041707.0, + "step": 2204 + }, + { + "epoch": 3.008193485752083, + "grad_norm": 0.29003048941352066, + "learning_rate": 1.4920301852076892e-05, + "loss": 0.276, + "num_tokens": 1681775995.0, + "step": 2205 + }, + { + "epoch": 3.0095590667107635, + "grad_norm": 0.2403321672799092, + "learning_rate": 1.4915826743472424e-05, + "loss": 0.2752, + "num_tokens": 1682594543.0, + "step": 2206 + }, + { + "epoch": 3.010924647669444, + "grad_norm": 0.21995235829265286, + "learning_rate": 1.4911350440318793e-05, + "loss": 0.2675, + "num_tokens": 1683357198.0, + "step": 2207 + }, + { + "epoch": 3.0122902286281246, + "grad_norm": 0.23887160898304638, + "learning_rate": 1.4906872943981533e-05, + "loss": 0.2778, + "num_tokens": 1684121225.0, + "step": 2208 + }, + { + "epoch": 3.013655809586805, + "grad_norm": 0.2433321087547303, + "learning_rate": 1.4902394255826527e-05, + "loss": 0.2817, + "num_tokens": 1684923547.0, + "step": 2209 + }, + { + "epoch": 3.0150213905454857, + "grad_norm": 0.20764288209821682, + "learning_rate": 1.4897914377220033e-05, + "loss": 0.2588, + "num_tokens": 1685699250.0, + "step": 2210 + }, + { + "epoch": 3.0163869715041662, + "grad_norm": 0.23732561401948254, + "learning_rate": 1.4893433309528664e-05, + "loss": 0.2672, + "num_tokens": 1686434028.0, + "step": 2211 + }, + { + "epoch": 3.017752552462847, + "grad_norm": 0.23328926515678808, + "learning_rate": 1.4888951054119405e-05, + "loss": 0.2666, + "num_tokens": 1687148227.0, + "step": 2212 + }, + { + "epoch": 3.019118133421527, + "grad_norm": 0.22279346932404173, + "learning_rate": 1.4884467612359597e-05, + "loss": 0.2568, + "num_tokens": 1687929275.0, + "step": 2213 + }, + { + "epoch": 3.0204837143802075, + "grad_norm": 0.19510000105424902, + "learning_rate": 1.487998298561694e-05, + "loss": 0.2785, + "num_tokens": 1688651922.0, + "step": 2214 + }, + { + "epoch": 3.021849295338888, + "grad_norm": 0.22198811521891376, + "learning_rate": 1.4875497175259503e-05, + "loss": 0.2727, + "num_tokens": 1689459178.0, + "step": 2215 + }, + { + "epoch": 3.0232148762975686, + "grad_norm": 0.20222064711836382, + "learning_rate": 1.4871010182655712e-05, + "loss": 0.2722, + "num_tokens": 1690235706.0, + "step": 2216 + }, + { + "epoch": 3.024580457256249, + "grad_norm": 0.20101925668929865, + "learning_rate": 1.4866522009174354e-05, + "loss": 0.2689, + "num_tokens": 1690937730.0, + "step": 2217 + }, + { + "epoch": 3.0259460382149297, + "grad_norm": 0.19204799521617283, + "learning_rate": 1.4862032656184573e-05, + "loss": 0.2827, + "num_tokens": 1691672712.0, + "step": 2218 + }, + { + "epoch": 3.0273116191736102, + "grad_norm": 0.2176374302004761, + "learning_rate": 1.485754212505588e-05, + "loss": 0.2604, + "num_tokens": 1692360559.0, + "step": 2219 + }, + { + "epoch": 3.028677200132291, + "grad_norm": 0.224149606737412, + "learning_rate": 1.4853050417158141e-05, + "loss": 0.2609, + "num_tokens": 1693097004.0, + "step": 2220 + }, + { + "epoch": 3.0300427810909714, + "grad_norm": 0.1966986586497178, + "learning_rate": 1.4848557533861583e-05, + "loss": 0.2737, + "num_tokens": 1693863860.0, + "step": 2221 + }, + { + "epoch": 3.0314083620496515, + "grad_norm": 0.21408858027582164, + "learning_rate": 1.4844063476536785e-05, + "loss": 0.2732, + "num_tokens": 1694616497.0, + "step": 2222 + }, + { + "epoch": 3.032773943008332, + "grad_norm": 0.19806226474534677, + "learning_rate": 1.4839568246554696e-05, + "loss": 0.2698, + "num_tokens": 1695335422.0, + "step": 2223 + }, + { + "epoch": 3.0341395239670126, + "grad_norm": 0.2085523460068365, + "learning_rate": 1.4835071845286608e-05, + "loss": 0.2591, + "num_tokens": 1696106399.0, + "step": 2224 + }, + { + "epoch": 3.035505104925693, + "grad_norm": 0.19466263058273708, + "learning_rate": 1.4830574274104185e-05, + "loss": 0.2789, + "num_tokens": 1696901217.0, + "step": 2225 + }, + { + "epoch": 3.0368706858843737, + "grad_norm": 0.20992490609055456, + "learning_rate": 1.4826075534379441e-05, + "loss": 0.2754, + "num_tokens": 1697703322.0, + "step": 2226 + }, + { + "epoch": 3.0382362668430543, + "grad_norm": 0.21760042672567967, + "learning_rate": 1.4821575627484744e-05, + "loss": 0.2656, + "num_tokens": 1698451933.0, + "step": 2227 + }, + { + "epoch": 3.039601847801735, + "grad_norm": 0.20209876152292114, + "learning_rate": 1.4817074554792821e-05, + "loss": 0.2711, + "num_tokens": 1699266887.0, + "step": 2228 + }, + { + "epoch": 3.0409674287604154, + "grad_norm": 0.2065950437730651, + "learning_rate": 1.4812572317676757e-05, + "loss": 0.2652, + "num_tokens": 1700005537.0, + "step": 2229 + }, + { + "epoch": 3.042333009719096, + "grad_norm": 0.21266386571001472, + "learning_rate": 1.4808068917509984e-05, + "loss": 0.2608, + "num_tokens": 1700755292.0, + "step": 2230 + }, + { + "epoch": 3.043698590677776, + "grad_norm": 0.2033819324565466, + "learning_rate": 1.4803564355666295e-05, + "loss": 0.2694, + "num_tokens": 1701426341.0, + "step": 2231 + }, + { + "epoch": 3.0450641716364566, + "grad_norm": 0.21856298426476906, + "learning_rate": 1.4799058633519845e-05, + "loss": 0.2683, + "num_tokens": 1702180580.0, + "step": 2232 + }, + { + "epoch": 3.046429752595137, + "grad_norm": 0.2030676099008066, + "learning_rate": 1.4794551752445127e-05, + "loss": 0.273, + "num_tokens": 1702977186.0, + "step": 2233 + }, + { + "epoch": 3.0477953335538177, + "grad_norm": 0.19178490525907393, + "learning_rate": 1.4790043713816997e-05, + "loss": 0.2667, + "num_tokens": 1703725763.0, + "step": 2234 + }, + { + "epoch": 3.0491609145124983, + "grad_norm": 0.21793329251451785, + "learning_rate": 1.4785534519010662e-05, + "loss": 0.2711, + "num_tokens": 1704490265.0, + "step": 2235 + }, + { + "epoch": 3.050526495471179, + "grad_norm": 0.18599691837743756, + "learning_rate": 1.4781024169401687e-05, + "loss": 0.2671, + "num_tokens": 1705216386.0, + "step": 2236 + }, + { + "epoch": 3.0518920764298594, + "grad_norm": 0.20586285380129404, + "learning_rate": 1.4776512666365976e-05, + "loss": 0.2722, + "num_tokens": 1705989014.0, + "step": 2237 + }, + { + "epoch": 3.05325765738854, + "grad_norm": 0.19360615454332133, + "learning_rate": 1.4772000011279799e-05, + "loss": 0.2754, + "num_tokens": 1706711821.0, + "step": 2238 + }, + { + "epoch": 3.0546232383472205, + "grad_norm": 0.20145638778911903, + "learning_rate": 1.4767486205519767e-05, + "loss": 0.2708, + "num_tokens": 1707528930.0, + "step": 2239 + }, + { + "epoch": 3.0559888193059006, + "grad_norm": 0.1895104940576205, + "learning_rate": 1.4762971250462855e-05, + "loss": 0.279, + "num_tokens": 1708288649.0, + "step": 2240 + }, + { + "epoch": 3.057354400264581, + "grad_norm": 0.19613206463228147, + "learning_rate": 1.4758455147486373e-05, + "loss": 0.2602, + "num_tokens": 1708976971.0, + "step": 2241 + }, + { + "epoch": 3.0587199812232617, + "grad_norm": 0.2197896441430753, + "learning_rate": 1.4753937897967987e-05, + "loss": 0.2548, + "num_tokens": 1709658194.0, + "step": 2242 + }, + { + "epoch": 3.0600855621819423, + "grad_norm": 0.1948262871334908, + "learning_rate": 1.4749419503285719e-05, + "loss": 0.2841, + "num_tokens": 1710497344.0, + "step": 2243 + }, + { + "epoch": 3.061451143140623, + "grad_norm": 0.19825310629761053, + "learning_rate": 1.4744899964817938e-05, + "loss": 0.2641, + "num_tokens": 1711246475.0, + "step": 2244 + }, + { + "epoch": 3.0628167240993034, + "grad_norm": 0.20244652951535042, + "learning_rate": 1.4740379283943353e-05, + "loss": 0.2972, + "num_tokens": 1712035162.0, + "step": 2245 + }, + { + "epoch": 3.064182305057984, + "grad_norm": 0.20181102642957294, + "learning_rate": 1.4735857462041028e-05, + "loss": 0.2759, + "num_tokens": 1712805969.0, + "step": 2246 + }, + { + "epoch": 3.0655478860166645, + "grad_norm": 0.22250525983278033, + "learning_rate": 1.4731334500490381e-05, + "loss": 0.2766, + "num_tokens": 1713567437.0, + "step": 2247 + }, + { + "epoch": 3.066913466975345, + "grad_norm": 0.19996608881583855, + "learning_rate": 1.4726810400671168e-05, + "loss": 0.2865, + "num_tokens": 1714432834.0, + "step": 2248 + }, + { + "epoch": 3.068279047934025, + "grad_norm": 0.2100388748331839, + "learning_rate": 1.4722285163963492e-05, + "loss": 0.2632, + "num_tokens": 1715198984.0, + "step": 2249 + }, + { + "epoch": 3.0696446288927057, + "grad_norm": 0.19491158630472177, + "learning_rate": 1.4717758791747817e-05, + "loss": 0.2744, + "num_tokens": 1716030529.0, + "step": 2250 + }, + { + "epoch": 3.0710102098513863, + "grad_norm": 0.1950865257467894, + "learning_rate": 1.4713231285404934e-05, + "loss": 0.2543, + "num_tokens": 1716773004.0, + "step": 2251 + }, + { + "epoch": 3.072375790810067, + "grad_norm": 0.1858118096923295, + "learning_rate": 1.470870264631599e-05, + "loss": 0.2648, + "num_tokens": 1717459740.0, + "step": 2252 + }, + { + "epoch": 3.0737413717687474, + "grad_norm": 0.2144494873951648, + "learning_rate": 1.4704172875862483e-05, + "loss": 0.2607, + "num_tokens": 1718233093.0, + "step": 2253 + }, + { + "epoch": 3.075106952727428, + "grad_norm": 0.20119930563263919, + "learning_rate": 1.4699641975426245e-05, + "loss": 0.2834, + "num_tokens": 1718984808.0, + "step": 2254 + }, + { + "epoch": 3.0764725336861085, + "grad_norm": 0.2031480891942438, + "learning_rate": 1.4695109946389457e-05, + "loss": 0.2796, + "num_tokens": 1719749428.0, + "step": 2255 + }, + { + "epoch": 3.077838114644789, + "grad_norm": 0.20016253818939397, + "learning_rate": 1.4690576790134645e-05, + "loss": 0.2653, + "num_tokens": 1720501981.0, + "step": 2256 + }, + { + "epoch": 3.0792036956034696, + "grad_norm": 0.19431408912614256, + "learning_rate": 1.468604250804468e-05, + "loss": 0.2724, + "num_tokens": 1721285224.0, + "step": 2257 + }, + { + "epoch": 3.0805692765621497, + "grad_norm": 0.19346712776447386, + "learning_rate": 1.4681507101502776e-05, + "loss": 0.2802, + "num_tokens": 1722057796.0, + "step": 2258 + }, + { + "epoch": 3.0819348575208303, + "grad_norm": 0.21536613128401977, + "learning_rate": 1.4676970571892488e-05, + "loss": 0.279, + "num_tokens": 1722793551.0, + "step": 2259 + }, + { + "epoch": 3.083300438479511, + "grad_norm": 0.22407404542449041, + "learning_rate": 1.4672432920597714e-05, + "loss": 0.2784, + "num_tokens": 1723508822.0, + "step": 2260 + }, + { + "epoch": 3.0846660194381914, + "grad_norm": 0.20391085926257044, + "learning_rate": 1.4667894149002695e-05, + "loss": 0.2562, + "num_tokens": 1724308513.0, + "step": 2261 + }, + { + "epoch": 3.086031600396872, + "grad_norm": 0.22101200362401402, + "learning_rate": 1.4663354258492016e-05, + "loss": 0.2618, + "num_tokens": 1725062416.0, + "step": 2262 + }, + { + "epoch": 3.0873971813555525, + "grad_norm": 0.2338530629617751, + "learning_rate": 1.4658813250450597e-05, + "loss": 0.2815, + "num_tokens": 1725880813.0, + "step": 2263 + }, + { + "epoch": 3.088762762314233, + "grad_norm": 0.19856320981260248, + "learning_rate": 1.4654271126263703e-05, + "loss": 0.2761, + "num_tokens": 1726610715.0, + "step": 2264 + }, + { + "epoch": 3.0901283432729136, + "grad_norm": 0.21023859548137452, + "learning_rate": 1.4649727887316947e-05, + "loss": 0.27, + "num_tokens": 1727451931.0, + "step": 2265 + }, + { + "epoch": 3.091493924231594, + "grad_norm": 0.20164994326479355, + "learning_rate": 1.4645183534996264e-05, + "loss": 0.2727, + "num_tokens": 1728339002.0, + "step": 2266 + }, + { + "epoch": 3.0928595051902743, + "grad_norm": 0.18537715192793544, + "learning_rate": 1.4640638070687948e-05, + "loss": 0.27, + "num_tokens": 1729082821.0, + "step": 2267 + }, + { + "epoch": 3.094225086148955, + "grad_norm": 0.19129862178750195, + "learning_rate": 1.4636091495778615e-05, + "loss": 0.2683, + "num_tokens": 1729888792.0, + "step": 2268 + }, + { + "epoch": 3.0955906671076354, + "grad_norm": 0.20608486159850312, + "learning_rate": 1.4631543811655235e-05, + "loss": 0.2826, + "num_tokens": 1730641414.0, + "step": 2269 + }, + { + "epoch": 3.096956248066316, + "grad_norm": 0.20037329802373388, + "learning_rate": 1.4626995019705106e-05, + "loss": 0.2636, + "num_tokens": 1731357190.0, + "step": 2270 + }, + { + "epoch": 3.0983218290249965, + "grad_norm": 0.2052007138857161, + "learning_rate": 1.462244512131587e-05, + "loss": 0.2711, + "num_tokens": 1732135733.0, + "step": 2271 + }, + { + "epoch": 3.099687409983677, + "grad_norm": 0.20759986867845528, + "learning_rate": 1.4617894117875502e-05, + "loss": 0.2689, + "num_tokens": 1732925347.0, + "step": 2272 + }, + { + "epoch": 3.1010529909423576, + "grad_norm": 0.20173762938712766, + "learning_rate": 1.4613342010772314e-05, + "loss": 0.2625, + "num_tokens": 1733653877.0, + "step": 2273 + }, + { + "epoch": 3.102418571901038, + "grad_norm": 0.21620064090798946, + "learning_rate": 1.460878880139496e-05, + "loss": 0.266, + "num_tokens": 1734409740.0, + "step": 2274 + }, + { + "epoch": 3.1037841528597188, + "grad_norm": 0.1899976534473343, + "learning_rate": 1.4604234491132428e-05, + "loss": 0.2782, + "num_tokens": 1735177761.0, + "step": 2275 + }, + { + "epoch": 3.105149733818399, + "grad_norm": 0.20802884763149862, + "learning_rate": 1.4599679081374036e-05, + "loss": 0.2629, + "num_tokens": 1735921648.0, + "step": 2276 + }, + { + "epoch": 3.1065153147770794, + "grad_norm": 0.19791023976606456, + "learning_rate": 1.4595122573509446e-05, + "loss": 0.2558, + "num_tokens": 1736695076.0, + "step": 2277 + }, + { + "epoch": 3.10788089573576, + "grad_norm": 0.19058118757596232, + "learning_rate": 1.4590564968928651e-05, + "loss": 0.2614, + "num_tokens": 1737542493.0, + "step": 2278 + }, + { + "epoch": 3.1092464766944405, + "grad_norm": 0.1937897011912724, + "learning_rate": 1.4586006269021977e-05, + "loss": 0.2752, + "num_tokens": 1738300508.0, + "step": 2279 + }, + { + "epoch": 3.110612057653121, + "grad_norm": 0.20998352941266432, + "learning_rate": 1.4581446475180088e-05, + "loss": 0.2666, + "num_tokens": 1739104154.0, + "step": 2280 + }, + { + "epoch": 3.1119776386118017, + "grad_norm": 0.20212651580658242, + "learning_rate": 1.457688558879398e-05, + "loss": 0.2728, + "num_tokens": 1739836600.0, + "step": 2281 + }, + { + "epoch": 3.113343219570482, + "grad_norm": 0.2073165129399981, + "learning_rate": 1.4572323611254978e-05, + "loss": 0.2656, + "num_tokens": 1740594005.0, + "step": 2282 + }, + { + "epoch": 3.1147088005291628, + "grad_norm": 0.19481001058483588, + "learning_rate": 1.4567760543954747e-05, + "loss": 0.2687, + "num_tokens": 1741404529.0, + "step": 2283 + }, + { + "epoch": 3.1160743814878433, + "grad_norm": 0.18718907762552472, + "learning_rate": 1.456319638828528e-05, + "loss": 0.2737, + "num_tokens": 1742280480.0, + "step": 2284 + }, + { + "epoch": 3.1174399624465234, + "grad_norm": 0.1924321447505693, + "learning_rate": 1.4558631145638906e-05, + "loss": 0.2501, + "num_tokens": 1743055740.0, + "step": 2285 + }, + { + "epoch": 3.118805543405204, + "grad_norm": 0.19046795790715754, + "learning_rate": 1.455406481740828e-05, + "loss": 0.264, + "num_tokens": 1743842841.0, + "step": 2286 + }, + { + "epoch": 3.1201711243638846, + "grad_norm": 0.18117077662199915, + "learning_rate": 1.4549497404986392e-05, + "loss": 0.2667, + "num_tokens": 1744607456.0, + "step": 2287 + }, + { + "epoch": 3.121536705322565, + "grad_norm": 0.21438378314956888, + "learning_rate": 1.454492890976656e-05, + "loss": 0.2662, + "num_tokens": 1745325118.0, + "step": 2288 + }, + { + "epoch": 3.1229022862812457, + "grad_norm": 0.18141864442326086, + "learning_rate": 1.4540359333142436e-05, + "loss": 0.2745, + "num_tokens": 1746063993.0, + "step": 2289 + }, + { + "epoch": 3.124267867239926, + "grad_norm": 0.20276168547371853, + "learning_rate": 1.4535788676508001e-05, + "loss": 0.2726, + "num_tokens": 1746854453.0, + "step": 2290 + }, + { + "epoch": 3.125633448198607, + "grad_norm": 0.18599996441046254, + "learning_rate": 1.4531216941257563e-05, + "loss": 0.2826, + "num_tokens": 1747702955.0, + "step": 2291 + }, + { + "epoch": 3.1269990291572873, + "grad_norm": 0.2014531676595862, + "learning_rate": 1.4526644128785757e-05, + "loss": 0.2658, + "num_tokens": 1748497979.0, + "step": 2292 + }, + { + "epoch": 3.128364610115968, + "grad_norm": 0.1861971793732089, + "learning_rate": 1.4522070240487554e-05, + "loss": 0.2667, + "num_tokens": 1749194255.0, + "step": 2293 + }, + { + "epoch": 3.129730191074648, + "grad_norm": 0.20896970759347083, + "learning_rate": 1.4517495277758249e-05, + "loss": 0.2671, + "num_tokens": 1749893265.0, + "step": 2294 + }, + { + "epoch": 3.1310957720333286, + "grad_norm": 0.19647917896789985, + "learning_rate": 1.4512919241993464e-05, + "loss": 0.2765, + "num_tokens": 1750646794.0, + "step": 2295 + }, + { + "epoch": 3.132461352992009, + "grad_norm": 0.21683591979593728, + "learning_rate": 1.450834213458915e-05, + "loss": 0.2573, + "num_tokens": 1751373646.0, + "step": 2296 + }, + { + "epoch": 3.1338269339506897, + "grad_norm": 0.18484300579181753, + "learning_rate": 1.4503763956941588e-05, + "loss": 0.2693, + "num_tokens": 1752135850.0, + "step": 2297 + }, + { + "epoch": 3.1351925149093702, + "grad_norm": 0.19595746680646864, + "learning_rate": 1.4499184710447373e-05, + "loss": 0.2759, + "num_tokens": 1752925287.0, + "step": 2298 + }, + { + "epoch": 3.136558095868051, + "grad_norm": 0.18327516046627232, + "learning_rate": 1.4494604396503442e-05, + "loss": 0.2713, + "num_tokens": 1753659189.0, + "step": 2299 + }, + { + "epoch": 3.1379236768267313, + "grad_norm": 0.2111461277787238, + "learning_rate": 1.4490023016507047e-05, + "loss": 0.2878, + "num_tokens": 1754553281.0, + "step": 2300 + }, + { + "epoch": 3.139289257785412, + "grad_norm": 0.1797702702193455, + "learning_rate": 1.4485440571855771e-05, + "loss": 0.2526, + "num_tokens": 1755263921.0, + "step": 2301 + }, + { + "epoch": 3.1406548387440925, + "grad_norm": 0.18523503943669864, + "learning_rate": 1.4480857063947516e-05, + "loss": 0.2724, + "num_tokens": 1755995142.0, + "step": 2302 + }, + { + "epoch": 3.1420204197027726, + "grad_norm": 0.20206616344745992, + "learning_rate": 1.4476272494180516e-05, + "loss": 0.2787, + "num_tokens": 1756821472.0, + "step": 2303 + }, + { + "epoch": 3.143386000661453, + "grad_norm": 0.17951393219051484, + "learning_rate": 1.447168686395332e-05, + "loss": 0.2648, + "num_tokens": 1757594663.0, + "step": 2304 + }, + { + "epoch": 3.1447515816201337, + "grad_norm": 0.19361205421147917, + "learning_rate": 1.4467100174664812e-05, + "loss": 0.265, + "num_tokens": 1758386255.0, + "step": 2305 + }, + { + "epoch": 3.1461171625788142, + "grad_norm": 0.19344878730399623, + "learning_rate": 1.4462512427714183e-05, + "loss": 0.2594, + "num_tokens": 1759156714.0, + "step": 2306 + }, + { + "epoch": 3.147482743537495, + "grad_norm": 0.19904136982215143, + "learning_rate": 1.4457923624500967e-05, + "loss": 0.2649, + "num_tokens": 1759930511.0, + "step": 2307 + }, + { + "epoch": 3.1488483244961754, + "grad_norm": 0.18525687102058397, + "learning_rate": 1.4453333766424999e-05, + "loss": 0.2803, + "num_tokens": 1760701781.0, + "step": 2308 + }, + { + "epoch": 3.150213905454856, + "grad_norm": 0.19989293676980155, + "learning_rate": 1.4448742854886453e-05, + "loss": 0.2711, + "num_tokens": 1761461085.0, + "step": 2309 + }, + { + "epoch": 3.1515794864135365, + "grad_norm": 0.1989199359620941, + "learning_rate": 1.4444150891285809e-05, + "loss": 0.2778, + "num_tokens": 1762219826.0, + "step": 2310 + }, + { + "epoch": 3.152945067372217, + "grad_norm": 0.20635260619112514, + "learning_rate": 1.4439557877023884e-05, + "loss": 0.2627, + "num_tokens": 1762969198.0, + "step": 2311 + }, + { + "epoch": 3.154310648330897, + "grad_norm": 0.20101007515774208, + "learning_rate": 1.4434963813501805e-05, + "loss": 0.2795, + "num_tokens": 1763798499.0, + "step": 2312 + }, + { + "epoch": 3.1556762292895777, + "grad_norm": 0.20200903864651423, + "learning_rate": 1.4430368702121025e-05, + "loss": 0.2729, + "num_tokens": 1764512846.0, + "step": 2313 + }, + { + "epoch": 3.1570418102482583, + "grad_norm": 0.21121387588087298, + "learning_rate": 1.442577254428331e-05, + "loss": 0.2761, + "num_tokens": 1765280950.0, + "step": 2314 + }, + { + "epoch": 3.158407391206939, + "grad_norm": 0.22211445826910664, + "learning_rate": 1.4421175341390748e-05, + "loss": 0.2687, + "num_tokens": 1766006894.0, + "step": 2315 + }, + { + "epoch": 3.1597729721656194, + "grad_norm": 0.20714235361611177, + "learning_rate": 1.4416577094845747e-05, + "loss": 0.284, + "num_tokens": 1766756459.0, + "step": 2316 + }, + { + "epoch": 3.1611385531243, + "grad_norm": 0.20840775425867972, + "learning_rate": 1.4411977806051034e-05, + "loss": 0.2733, + "num_tokens": 1767519194.0, + "step": 2317 + }, + { + "epoch": 3.1625041340829805, + "grad_norm": 0.19170847634261362, + "learning_rate": 1.4407377476409653e-05, + "loss": 0.2793, + "num_tokens": 1768285423.0, + "step": 2318 + }, + { + "epoch": 3.163869715041661, + "grad_norm": 0.19536993777396655, + "learning_rate": 1.4402776107324964e-05, + "loss": 0.2706, + "num_tokens": 1769003459.0, + "step": 2319 + }, + { + "epoch": 3.1652352960003416, + "grad_norm": 0.19983814951734666, + "learning_rate": 1.4398173700200646e-05, + "loss": 0.2659, + "num_tokens": 1769763774.0, + "step": 2320 + }, + { + "epoch": 3.1666008769590217, + "grad_norm": 0.18557608694347913, + "learning_rate": 1.4393570256440699e-05, + "loss": 0.2838, + "num_tokens": 1770502224.0, + "step": 2321 + }, + { + "epoch": 3.1679664579177023, + "grad_norm": 0.20565048084962154, + "learning_rate": 1.4388965777449428e-05, + "loss": 0.2671, + "num_tokens": 1771285274.0, + "step": 2322 + }, + { + "epoch": 3.169332038876383, + "grad_norm": 0.28458540471991706, + "learning_rate": 1.4384360264631458e-05, + "loss": 0.2883, + "num_tokens": 1772115136.0, + "step": 2323 + }, + { + "epoch": 3.1706976198350634, + "grad_norm": 0.2079189114188602, + "learning_rate": 1.437975371939174e-05, + "loss": 0.2604, + "num_tokens": 1772873775.0, + "step": 2324 + }, + { + "epoch": 3.172063200793744, + "grad_norm": 0.20119644810775764, + "learning_rate": 1.4375146143135524e-05, + "loss": 0.2749, + "num_tokens": 1773591314.0, + "step": 2325 + }, + { + "epoch": 3.1734287817524245, + "grad_norm": 0.20406512052702003, + "learning_rate": 1.4370537537268386e-05, + "loss": 0.2735, + "num_tokens": 1774349412.0, + "step": 2326 + }, + { + "epoch": 3.174794362711105, + "grad_norm": 0.20175414011455178, + "learning_rate": 1.436592790319621e-05, + "loss": 0.2772, + "num_tokens": 1775162092.0, + "step": 2327 + }, + { + "epoch": 3.1761599436697856, + "grad_norm": 0.1900146731182938, + "learning_rate": 1.4361317242325199e-05, + "loss": 0.275, + "num_tokens": 1775965500.0, + "step": 2328 + }, + { + "epoch": 3.177525524628466, + "grad_norm": 0.209487503110832, + "learning_rate": 1.4356705556061861e-05, + "loss": 0.2794, + "num_tokens": 1776761987.0, + "step": 2329 + }, + { + "epoch": 3.1788911055871463, + "grad_norm": 0.20249349503618314, + "learning_rate": 1.4352092845813026e-05, + "loss": 0.2796, + "num_tokens": 1777572641.0, + "step": 2330 + }, + { + "epoch": 3.180256686545827, + "grad_norm": 0.2097367881031723, + "learning_rate": 1.4347479112985832e-05, + "loss": 0.2696, + "num_tokens": 1778306091.0, + "step": 2331 + }, + { + "epoch": 3.1816222675045074, + "grad_norm": 0.21131397648090708, + "learning_rate": 1.4342864358987724e-05, + "loss": 0.2801, + "num_tokens": 1779088127.0, + "step": 2332 + }, + { + "epoch": 3.182987848463188, + "grad_norm": 0.19704727665835065, + "learning_rate": 1.4338248585226468e-05, + "loss": 0.2783, + "num_tokens": 1779844476.0, + "step": 2333 + }, + { + "epoch": 3.1843534294218685, + "grad_norm": 0.19629770635218005, + "learning_rate": 1.433363179311014e-05, + "loss": 0.2683, + "num_tokens": 1780636469.0, + "step": 2334 + }, + { + "epoch": 3.185719010380549, + "grad_norm": 0.19324929266346322, + "learning_rate": 1.4329013984047113e-05, + "loss": 0.2726, + "num_tokens": 1781442926.0, + "step": 2335 + }, + { + "epoch": 3.1870845913392296, + "grad_norm": 0.28401242765190365, + "learning_rate": 1.4324395159446091e-05, + "loss": 0.2712, + "num_tokens": 1782217502.0, + "step": 2336 + }, + { + "epoch": 3.18845017229791, + "grad_norm": 0.21936401161232488, + "learning_rate": 1.4319775320716072e-05, + "loss": 0.2729, + "num_tokens": 1782994920.0, + "step": 2337 + }, + { + "epoch": 3.1898157532565907, + "grad_norm": 0.21051432451405766, + "learning_rate": 1.431515446926637e-05, + "loss": 0.2798, + "num_tokens": 1783820902.0, + "step": 2338 + }, + { + "epoch": 3.191181334215271, + "grad_norm": 0.20491197340481876, + "learning_rate": 1.4310532606506606e-05, + "loss": 0.2678, + "num_tokens": 1784466390.0, + "step": 2339 + }, + { + "epoch": 3.1925469151739514, + "grad_norm": 0.20206995950075815, + "learning_rate": 1.4305909733846712e-05, + "loss": 0.2719, + "num_tokens": 1785193712.0, + "step": 2340 + }, + { + "epoch": 3.193912496132632, + "grad_norm": 0.19334300379917063, + "learning_rate": 1.4301285852696925e-05, + "loss": 0.2854, + "num_tokens": 1785983454.0, + "step": 2341 + }, + { + "epoch": 3.1952780770913125, + "grad_norm": 0.21481896418852425, + "learning_rate": 1.429666096446779e-05, + "loss": 0.2911, + "num_tokens": 1786769124.0, + "step": 2342 + }, + { + "epoch": 3.196643658049993, + "grad_norm": 0.20075091837290981, + "learning_rate": 1.429203507057016e-05, + "loss": 0.2791, + "num_tokens": 1787578782.0, + "step": 2343 + }, + { + "epoch": 3.1980092390086736, + "grad_norm": 0.1748558500887479, + "learning_rate": 1.4287408172415196e-05, + "loss": 0.2625, + "num_tokens": 1788348886.0, + "step": 2344 + }, + { + "epoch": 3.199374819967354, + "grad_norm": 0.18902015027805866, + "learning_rate": 1.4282780271414364e-05, + "loss": 0.2706, + "num_tokens": 1789173291.0, + "step": 2345 + }, + { + "epoch": 3.2007404009260347, + "grad_norm": 0.1878271559729813, + "learning_rate": 1.427815136897944e-05, + "loss": 0.2746, + "num_tokens": 1789902962.0, + "step": 2346 + }, + { + "epoch": 3.2021059818847153, + "grad_norm": 0.19244106197322086, + "learning_rate": 1.4273521466522495e-05, + "loss": 0.2632, + "num_tokens": 1790634271.0, + "step": 2347 + }, + { + "epoch": 3.2034715628433954, + "grad_norm": 0.1856597327097042, + "learning_rate": 1.4268890565455915e-05, + "loss": 0.268, + "num_tokens": 1791383433.0, + "step": 2348 + }, + { + "epoch": 3.204837143802076, + "grad_norm": 0.20744931370732766, + "learning_rate": 1.4264258667192387e-05, + "loss": 0.2791, + "num_tokens": 1792093988.0, + "step": 2349 + }, + { + "epoch": 3.2062027247607565, + "grad_norm": 0.20645614812173774, + "learning_rate": 1.4259625773144903e-05, + "loss": 0.262, + "num_tokens": 1792863344.0, + "step": 2350 + }, + { + "epoch": 3.207568305719437, + "grad_norm": 0.18987659061215575, + "learning_rate": 1.4254991884726757e-05, + "loss": 0.2841, + "num_tokens": 1793618958.0, + "step": 2351 + }, + { + "epoch": 3.2089338866781176, + "grad_norm": 0.19647829624502972, + "learning_rate": 1.425035700335155e-05, + "loss": 0.262, + "num_tokens": 1794413563.0, + "step": 2352 + }, + { + "epoch": 3.210299467636798, + "grad_norm": 0.2054099217499659, + "learning_rate": 1.4245721130433186e-05, + "loss": 0.2839, + "num_tokens": 1795183676.0, + "step": 2353 + }, + { + "epoch": 3.2116650485954787, + "grad_norm": 0.1894342294148697, + "learning_rate": 1.4241084267385858e-05, + "loss": 0.2574, + "num_tokens": 1795919763.0, + "step": 2354 + }, + { + "epoch": 3.2130306295541593, + "grad_norm": 0.1945660092639355, + "learning_rate": 1.4236446415624086e-05, + "loss": 0.2566, + "num_tokens": 1796757219.0, + "step": 2355 + }, + { + "epoch": 3.21439621051284, + "grad_norm": 0.18536416046550494, + "learning_rate": 1.4231807576562666e-05, + "loss": 0.2674, + "num_tokens": 1797511233.0, + "step": 2356 + }, + { + "epoch": 3.21576179147152, + "grad_norm": 0.1894574178871485, + "learning_rate": 1.4227167751616713e-05, + "loss": 0.2634, + "num_tokens": 1798315183.0, + "step": 2357 + }, + { + "epoch": 3.2171273724302005, + "grad_norm": 0.1990392713232286, + "learning_rate": 1.4222526942201638e-05, + "loss": 0.2617, + "num_tokens": 1799033399.0, + "step": 2358 + }, + { + "epoch": 3.218492953388881, + "grad_norm": 0.19377607820070814, + "learning_rate": 1.4217885149733144e-05, + "loss": 0.2817, + "num_tokens": 1799850743.0, + "step": 2359 + }, + { + "epoch": 3.2198585343475616, + "grad_norm": 0.22330869409881712, + "learning_rate": 1.421324237562725e-05, + "loss": 0.2747, + "num_tokens": 1800662498.0, + "step": 2360 + }, + { + "epoch": 3.221224115306242, + "grad_norm": 0.1856030499956643, + "learning_rate": 1.420859862130026e-05, + "loss": 0.2717, + "num_tokens": 1801470029.0, + "step": 2361 + }, + { + "epoch": 3.2225896962649228, + "grad_norm": 0.17760080327852162, + "learning_rate": 1.4203953888168785e-05, + "loss": 0.2634, + "num_tokens": 1802182497.0, + "step": 2362 + }, + { + "epoch": 3.2239552772236033, + "grad_norm": 0.20090915754407884, + "learning_rate": 1.4199308177649726e-05, + "loss": 0.2726, + "num_tokens": 1802946743.0, + "step": 2363 + }, + { + "epoch": 3.225320858182284, + "grad_norm": 0.20099890582760246, + "learning_rate": 1.4194661491160296e-05, + "loss": 0.2677, + "num_tokens": 1803698167.0, + "step": 2364 + }, + { + "epoch": 3.2266864391409644, + "grad_norm": 0.18938295677206265, + "learning_rate": 1.4190013830117991e-05, + "loss": 0.2705, + "num_tokens": 1804442908.0, + "step": 2365 + }, + { + "epoch": 3.2280520200996445, + "grad_norm": 0.2127502761146136, + "learning_rate": 1.4185365195940615e-05, + "loss": 0.2648, + "num_tokens": 1805209741.0, + "step": 2366 + }, + { + "epoch": 3.229417601058325, + "grad_norm": 0.1756509205093806, + "learning_rate": 1.4180715590046267e-05, + "loss": 0.2578, + "num_tokens": 1805997003.0, + "step": 2367 + }, + { + "epoch": 3.2307831820170057, + "grad_norm": 0.18894554059244187, + "learning_rate": 1.4176065013853333e-05, + "loss": 0.2567, + "num_tokens": 1806710104.0, + "step": 2368 + }, + { + "epoch": 3.232148762975686, + "grad_norm": 0.1945951109195739, + "learning_rate": 1.417141346878051e-05, + "loss": 0.2735, + "num_tokens": 1807514842.0, + "step": 2369 + }, + { + "epoch": 3.2335143439343668, + "grad_norm": 0.18533993257871448, + "learning_rate": 1.4166760956246779e-05, + "loss": 0.2733, + "num_tokens": 1808263080.0, + "step": 2370 + }, + { + "epoch": 3.2348799248930473, + "grad_norm": 0.19325603496391036, + "learning_rate": 1.4162107477671422e-05, + "loss": 0.2715, + "num_tokens": 1808983353.0, + "step": 2371 + }, + { + "epoch": 3.236245505851728, + "grad_norm": 0.21337066444507233, + "learning_rate": 1.4157453034474013e-05, + "loss": 0.2692, + "num_tokens": 1809756018.0, + "step": 2372 + }, + { + "epoch": 3.2376110868104084, + "grad_norm": 0.1902265953661659, + "learning_rate": 1.415279762807442e-05, + "loss": 0.2734, + "num_tokens": 1810533372.0, + "step": 2373 + }, + { + "epoch": 3.238976667769089, + "grad_norm": 0.19632325918192975, + "learning_rate": 1.4148141259892807e-05, + "loss": 0.262, + "num_tokens": 1811294632.0, + "step": 2374 + }, + { + "epoch": 3.240342248727769, + "grad_norm": 0.1929337668074922, + "learning_rate": 1.4143483931349629e-05, + "loss": 0.2755, + "num_tokens": 1812064769.0, + "step": 2375 + }, + { + "epoch": 3.2417078296864497, + "grad_norm": 0.18730339925908246, + "learning_rate": 1.4138825643865636e-05, + "loss": 0.2642, + "num_tokens": 1812762179.0, + "step": 2376 + }, + { + "epoch": 3.24307341064513, + "grad_norm": 0.20695056031931852, + "learning_rate": 1.4134166398861875e-05, + "loss": 0.2683, + "num_tokens": 1813498263.0, + "step": 2377 + }, + { + "epoch": 3.2444389916038108, + "grad_norm": 0.21467923502621347, + "learning_rate": 1.4129506197759668e-05, + "loss": 0.2726, + "num_tokens": 1814303267.0, + "step": 2378 + }, + { + "epoch": 3.2458045725624913, + "grad_norm": 0.1910362099474822, + "learning_rate": 1.412484504198065e-05, + "loss": 0.2672, + "num_tokens": 1815100521.0, + "step": 2379 + }, + { + "epoch": 3.247170153521172, + "grad_norm": 0.20428481339616017, + "learning_rate": 1.4120182932946735e-05, + "loss": 0.277, + "num_tokens": 1815859424.0, + "step": 2380 + }, + { + "epoch": 3.2485357344798524, + "grad_norm": 0.2030097769393471, + "learning_rate": 1.4115519872080133e-05, + "loss": 0.28, + "num_tokens": 1816711193.0, + "step": 2381 + }, + { + "epoch": 3.249901315438533, + "grad_norm": 0.186672110681332, + "learning_rate": 1.4110855860803336e-05, + "loss": 0.2715, + "num_tokens": 1817520515.0, + "step": 2382 + }, + { + "epoch": 3.2512668963972136, + "grad_norm": 0.2063978254589925, + "learning_rate": 1.4106190900539141e-05, + "loss": 0.272, + "num_tokens": 1818251730.0, + "step": 2383 + }, + { + "epoch": 3.2526324773558937, + "grad_norm": 0.19739164519681418, + "learning_rate": 1.4101524992710616e-05, + "loss": 0.2661, + "num_tokens": 1819001705.0, + "step": 2384 + }, + { + "epoch": 3.2539980583145742, + "grad_norm": 0.18905795610669557, + "learning_rate": 1.409685813874113e-05, + "loss": 0.2824, + "num_tokens": 1819754800.0, + "step": 2385 + }, + { + "epoch": 3.255363639273255, + "grad_norm": 0.20196025312584504, + "learning_rate": 1.4092190340054344e-05, + "loss": 0.2717, + "num_tokens": 1820516845.0, + "step": 2386 + }, + { + "epoch": 3.2567292202319353, + "grad_norm": 0.2079661419465257, + "learning_rate": 1.4087521598074195e-05, + "loss": 0.2808, + "num_tokens": 1821280716.0, + "step": 2387 + }, + { + "epoch": 3.258094801190616, + "grad_norm": 0.20596046428315215, + "learning_rate": 1.4082851914224913e-05, + "loss": 0.28, + "num_tokens": 1822024802.0, + "step": 2388 + }, + { + "epoch": 3.2594603821492965, + "grad_norm": 0.20696138524936541, + "learning_rate": 1.4078181289931021e-05, + "loss": 0.2797, + "num_tokens": 1822815644.0, + "step": 2389 + }, + { + "epoch": 3.260825963107977, + "grad_norm": 0.1972246795085495, + "learning_rate": 1.407350972661732e-05, + "loss": 0.2588, + "num_tokens": 1823567588.0, + "step": 2390 + }, + { + "epoch": 3.2621915440666576, + "grad_norm": 0.20894970870859864, + "learning_rate": 1.4068837225708904e-05, + "loss": 0.2726, + "num_tokens": 1824278408.0, + "step": 2391 + }, + { + "epoch": 3.263557125025338, + "grad_norm": 0.20751431948978596, + "learning_rate": 1.4064163788631153e-05, + "loss": 0.2747, + "num_tokens": 1825024934.0, + "step": 2392 + }, + { + "epoch": 3.2649227059840182, + "grad_norm": 0.19778436940820787, + "learning_rate": 1.4059489416809729e-05, + "loss": 0.2866, + "num_tokens": 1825789539.0, + "step": 2393 + }, + { + "epoch": 3.266288286942699, + "grad_norm": 0.21734713939537242, + "learning_rate": 1.4054814111670575e-05, + "loss": 0.2669, + "num_tokens": 1826550378.0, + "step": 2394 + }, + { + "epoch": 3.2676538679013793, + "grad_norm": 0.19293344811952878, + "learning_rate": 1.405013787463993e-05, + "loss": 0.2521, + "num_tokens": 1827299610.0, + "step": 2395 + }, + { + "epoch": 3.26901944886006, + "grad_norm": 0.19622870374747675, + "learning_rate": 1.4045460707144311e-05, + "loss": 0.2663, + "num_tokens": 1828107717.0, + "step": 2396 + }, + { + "epoch": 3.2703850298187405, + "grad_norm": 0.1902307529714498, + "learning_rate": 1.4040782610610516e-05, + "loss": 0.2777, + "num_tokens": 1828902636.0, + "step": 2397 + }, + { + "epoch": 3.271750610777421, + "grad_norm": 0.2088105453250827, + "learning_rate": 1.4036103586465632e-05, + "loss": 0.2755, + "num_tokens": 1829627314.0, + "step": 2398 + }, + { + "epoch": 3.2731161917361016, + "grad_norm": 0.18934224697762408, + "learning_rate": 1.4031423636137028e-05, + "loss": 0.2701, + "num_tokens": 1830274481.0, + "step": 2399 + }, + { + "epoch": 3.274481772694782, + "grad_norm": 0.2320238836704446, + "learning_rate": 1.4026742761052353e-05, + "loss": 0.2794, + "num_tokens": 1831045697.0, + "step": 2400 + }, + { + "epoch": 3.2758473536534627, + "grad_norm": 0.21981480376341683, + "learning_rate": 1.4022060962639534e-05, + "loss": 0.2627, + "num_tokens": 1831762032.0, + "step": 2401 + }, + { + "epoch": 3.277212934612143, + "grad_norm": 0.18903806305561144, + "learning_rate": 1.4017378242326794e-05, + "loss": 0.2687, + "num_tokens": 1832574196.0, + "step": 2402 + }, + { + "epoch": 3.2785785155708234, + "grad_norm": 0.1856954897004164, + "learning_rate": 1.401269460154262e-05, + "loss": 0.2737, + "num_tokens": 1833308461.0, + "step": 2403 + }, + { + "epoch": 3.279944096529504, + "grad_norm": 0.20026380157428866, + "learning_rate": 1.4008010041715796e-05, + "loss": 0.2621, + "num_tokens": 1834044395.0, + "step": 2404 + }, + { + "epoch": 3.2813096774881845, + "grad_norm": 0.1967556879550702, + "learning_rate": 1.4003324564275371e-05, + "loss": 0.271, + "num_tokens": 1834811463.0, + "step": 2405 + }, + { + "epoch": 3.282675258446865, + "grad_norm": 0.1894988566107046, + "learning_rate": 1.3998638170650685e-05, + "loss": 0.2811, + "num_tokens": 1835535399.0, + "step": 2406 + }, + { + "epoch": 3.2840408394055456, + "grad_norm": 0.20507101297566238, + "learning_rate": 1.3993950862271355e-05, + "loss": 0.2835, + "num_tokens": 1836314807.0, + "step": 2407 + }, + { + "epoch": 3.285406420364226, + "grad_norm": 0.19607694201006956, + "learning_rate": 1.3989262640567273e-05, + "loss": 0.2904, + "num_tokens": 1837137603.0, + "step": 2408 + }, + { + "epoch": 3.2867720013229067, + "grad_norm": 0.217607442685979, + "learning_rate": 1.3984573506968614e-05, + "loss": 0.2703, + "num_tokens": 1837951755.0, + "step": 2409 + }, + { + "epoch": 3.2881375822815873, + "grad_norm": 0.18462937823399556, + "learning_rate": 1.3979883462905825e-05, + "loss": 0.2648, + "num_tokens": 1838738364.0, + "step": 2410 + }, + { + "epoch": 3.2895031632402674, + "grad_norm": 0.19751045746999715, + "learning_rate": 1.3975192509809644e-05, + "loss": 0.2814, + "num_tokens": 1839543067.0, + "step": 2411 + }, + { + "epoch": 3.290868744198948, + "grad_norm": 0.18789992618267778, + "learning_rate": 1.397050064911107e-05, + "loss": 0.2764, + "num_tokens": 1840324419.0, + "step": 2412 + }, + { + "epoch": 3.2922343251576285, + "grad_norm": 0.19088428721500467, + "learning_rate": 1.396580788224139e-05, + "loss": 0.2688, + "num_tokens": 1841045468.0, + "step": 2413 + }, + { + "epoch": 3.293599906116309, + "grad_norm": 0.20203068783355285, + "learning_rate": 1.3961114210632163e-05, + "loss": 0.2706, + "num_tokens": 1841771222.0, + "step": 2414 + }, + { + "epoch": 3.2949654870749896, + "grad_norm": 0.1974406144672401, + "learning_rate": 1.3956419635715225e-05, + "loss": 0.266, + "num_tokens": 1842507358.0, + "step": 2415 + }, + { + "epoch": 3.29633106803367, + "grad_norm": 0.20496574393735745, + "learning_rate": 1.3951724158922691e-05, + "loss": 0.2731, + "num_tokens": 1843281092.0, + "step": 2416 + }, + { + "epoch": 3.2976966489923507, + "grad_norm": 0.19216996053962682, + "learning_rate": 1.3947027781686942e-05, + "loss": 0.2713, + "num_tokens": 1844043217.0, + "step": 2417 + }, + { + "epoch": 3.2990622299510313, + "grad_norm": 0.19858536354206732, + "learning_rate": 1.3942330505440642e-05, + "loss": 0.2774, + "num_tokens": 1844747286.0, + "step": 2418 + }, + { + "epoch": 3.300427810909712, + "grad_norm": 0.19381621347966296, + "learning_rate": 1.3937632331616725e-05, + "loss": 0.2757, + "num_tokens": 1845555121.0, + "step": 2419 + }, + { + "epoch": 3.301793391868392, + "grad_norm": 0.20715845541118993, + "learning_rate": 1.3932933261648403e-05, + "loss": 0.2787, + "num_tokens": 1846294143.0, + "step": 2420 + }, + { + "epoch": 3.3031589728270725, + "grad_norm": 0.20096629358001156, + "learning_rate": 1.3928233296969155e-05, + "loss": 0.2629, + "num_tokens": 1847059883.0, + "step": 2421 + }, + { + "epoch": 3.304524553785753, + "grad_norm": 0.17963840500325345, + "learning_rate": 1.3923532439012742e-05, + "loss": 0.26, + "num_tokens": 1847836713.0, + "step": 2422 + }, + { + "epoch": 3.3058901347444336, + "grad_norm": 0.1900830619021606, + "learning_rate": 1.391883068921319e-05, + "loss": 0.2775, + "num_tokens": 1848613558.0, + "step": 2423 + }, + { + "epoch": 3.307255715703114, + "grad_norm": 0.17813354642806067, + "learning_rate": 1.3914128049004797e-05, + "loss": 0.2825, + "num_tokens": 1849411909.0, + "step": 2424 + }, + { + "epoch": 3.3086212966617947, + "grad_norm": 0.1954031959867918, + "learning_rate": 1.3909424519822136e-05, + "loss": 0.2647, + "num_tokens": 1850210919.0, + "step": 2425 + }, + { + "epoch": 3.3099868776204753, + "grad_norm": 0.19952271418198922, + "learning_rate": 1.3904720103100051e-05, + "loss": 0.2576, + "num_tokens": 1850908224.0, + "step": 2426 + }, + { + "epoch": 3.311352458579156, + "grad_norm": 0.1896737061869976, + "learning_rate": 1.3900014800273656e-05, + "loss": 0.2574, + "num_tokens": 1851682845.0, + "step": 2427 + }, + { + "epoch": 3.3127180395378364, + "grad_norm": 0.1938494565211211, + "learning_rate": 1.3895308612778333e-05, + "loss": 0.2777, + "num_tokens": 1852463298.0, + "step": 2428 + }, + { + "epoch": 3.3140836204965165, + "grad_norm": 0.21082332608871743, + "learning_rate": 1.3890601542049737e-05, + "loss": 0.2712, + "num_tokens": 1853227973.0, + "step": 2429 + }, + { + "epoch": 3.315449201455197, + "grad_norm": 0.18966939316155573, + "learning_rate": 1.3885893589523794e-05, + "loss": 0.2734, + "num_tokens": 1853999112.0, + "step": 2430 + }, + { + "epoch": 3.3168147824138776, + "grad_norm": 0.19628588546367426, + "learning_rate": 1.3881184756636691e-05, + "loss": 0.2539, + "num_tokens": 1854747751.0, + "step": 2431 + }, + { + "epoch": 3.318180363372558, + "grad_norm": 0.1790324785243348, + "learning_rate": 1.3876475044824897e-05, + "loss": 0.2781, + "num_tokens": 1855490157.0, + "step": 2432 + }, + { + "epoch": 3.3195459443312387, + "grad_norm": 0.19526522859432022, + "learning_rate": 1.3871764455525134e-05, + "loss": 0.2661, + "num_tokens": 1856274547.0, + "step": 2433 + }, + { + "epoch": 3.3209115252899193, + "grad_norm": 0.19259644863012654, + "learning_rate": 1.3867052990174399e-05, + "loss": 0.2714, + "num_tokens": 1856990190.0, + "step": 2434 + }, + { + "epoch": 3.3222771062486, + "grad_norm": 0.21815807432967999, + "learning_rate": 1.3862340650209962e-05, + "loss": 0.2667, + "num_tokens": 1857781952.0, + "step": 2435 + }, + { + "epoch": 3.3236426872072804, + "grad_norm": 0.18692719282055642, + "learning_rate": 1.3857627437069348e-05, + "loss": 0.2916, + "num_tokens": 1858632728.0, + "step": 2436 + }, + { + "epoch": 3.325008268165961, + "grad_norm": 0.19516290912115158, + "learning_rate": 1.3852913352190356e-05, + "loss": 0.2744, + "num_tokens": 1859407217.0, + "step": 2437 + }, + { + "epoch": 3.326373849124641, + "grad_norm": 0.20362220225490232, + "learning_rate": 1.384819839701105e-05, + "loss": 0.2696, + "num_tokens": 1860132185.0, + "step": 2438 + }, + { + "epoch": 3.3277394300833216, + "grad_norm": 0.1943262438276806, + "learning_rate": 1.384348257296976e-05, + "loss": 0.2838, + "num_tokens": 1860871687.0, + "step": 2439 + }, + { + "epoch": 3.329105011042002, + "grad_norm": 0.2296024867260925, + "learning_rate": 1.3838765881505074e-05, + "loss": 0.2804, + "num_tokens": 1861721303.0, + "step": 2440 + }, + { + "epoch": 3.3304705920006827, + "grad_norm": 0.2012551643836199, + "learning_rate": 1.3834048324055857e-05, + "loss": 0.2794, + "num_tokens": 1862521097.0, + "step": 2441 + }, + { + "epoch": 3.3318361729593633, + "grad_norm": 0.20123997884403427, + "learning_rate": 1.3829329902061226e-05, + "loss": 0.27, + "num_tokens": 1863338232.0, + "step": 2442 + }, + { + "epoch": 3.333201753918044, + "grad_norm": 0.1942542322580485, + "learning_rate": 1.3824610616960573e-05, + "loss": 0.2597, + "num_tokens": 1864055409.0, + "step": 2443 + }, + { + "epoch": 3.3345673348767244, + "grad_norm": 0.21029191526714253, + "learning_rate": 1.3819890470193543e-05, + "loss": 0.2662, + "num_tokens": 1864806238.0, + "step": 2444 + }, + { + "epoch": 3.335932915835405, + "grad_norm": 0.19476125011994028, + "learning_rate": 1.3815169463200046e-05, + "loss": 0.2701, + "num_tokens": 1865484777.0, + "step": 2445 + }, + { + "epoch": 3.3372984967940855, + "grad_norm": 0.2047422994765326, + "learning_rate": 1.3810447597420262e-05, + "loss": 0.2677, + "num_tokens": 1866297124.0, + "step": 2446 + }, + { + "epoch": 3.3386640777527656, + "grad_norm": 0.20260875584191437, + "learning_rate": 1.3805724874294628e-05, + "loss": 0.2682, + "num_tokens": 1867014965.0, + "step": 2447 + }, + { + "epoch": 3.340029658711446, + "grad_norm": 0.19322247646533522, + "learning_rate": 1.3801001295263842e-05, + "loss": 0.275, + "num_tokens": 1867800592.0, + "step": 2448 + }, + { + "epoch": 3.3413952396701267, + "grad_norm": 0.19475628367282763, + "learning_rate": 1.3796276861768859e-05, + "loss": 0.2695, + "num_tokens": 1868551776.0, + "step": 2449 + }, + { + "epoch": 3.3427608206288073, + "grad_norm": 0.20169251895697923, + "learning_rate": 1.3791551575250901e-05, + "loss": 0.2721, + "num_tokens": 1869333305.0, + "step": 2450 + }, + { + "epoch": 3.344126401587488, + "grad_norm": 0.1854383161796759, + "learning_rate": 1.3786825437151453e-05, + "loss": 0.2761, + "num_tokens": 1870111426.0, + "step": 2451 + }, + { + "epoch": 3.3454919825461684, + "grad_norm": 0.1902383091227898, + "learning_rate": 1.3782098448912247e-05, + "loss": 0.28, + "num_tokens": 1870907351.0, + "step": 2452 + }, + { + "epoch": 3.346857563504849, + "grad_norm": 0.20149107053487655, + "learning_rate": 1.3777370611975291e-05, + "loss": 0.2706, + "num_tokens": 1871697360.0, + "step": 2453 + }, + { + "epoch": 3.3482231444635295, + "grad_norm": 0.18943868323894744, + "learning_rate": 1.3772641927782836e-05, + "loss": 0.2699, + "num_tokens": 1872447283.0, + "step": 2454 + }, + { + "epoch": 3.34958872542221, + "grad_norm": 0.20753744857512585, + "learning_rate": 1.3767912397777403e-05, + "loss": 0.2707, + "num_tokens": 1873155344.0, + "step": 2455 + }, + { + "epoch": 3.35095430638089, + "grad_norm": 0.2111458906802961, + "learning_rate": 1.3763182023401765e-05, + "loss": 0.2772, + "num_tokens": 1873997082.0, + "step": 2456 + }, + { + "epoch": 3.3523198873395708, + "grad_norm": 0.20664780080299744, + "learning_rate": 1.3758450806098956e-05, + "loss": 0.2805, + "num_tokens": 1874759460.0, + "step": 2457 + }, + { + "epoch": 3.3536854682982513, + "grad_norm": 0.20519094498415677, + "learning_rate": 1.3753718747312261e-05, + "loss": 0.2816, + "num_tokens": 1875534961.0, + "step": 2458 + }, + { + "epoch": 3.355051049256932, + "grad_norm": 0.1930402388354139, + "learning_rate": 1.3748985848485232e-05, + "loss": 0.2758, + "num_tokens": 1876285425.0, + "step": 2459 + }, + { + "epoch": 3.3564166302156124, + "grad_norm": 0.19042022777565823, + "learning_rate": 1.3744252111061671e-05, + "loss": 0.2676, + "num_tokens": 1876996711.0, + "step": 2460 + }, + { + "epoch": 3.357782211174293, + "grad_norm": 0.20789373696126046, + "learning_rate": 1.373951753648563e-05, + "loss": 0.2747, + "num_tokens": 1877840940.0, + "step": 2461 + }, + { + "epoch": 3.3591477921329735, + "grad_norm": 0.18976215866525817, + "learning_rate": 1.3734782126201431e-05, + "loss": 0.2739, + "num_tokens": 1878639306.0, + "step": 2462 + }, + { + "epoch": 3.360513373091654, + "grad_norm": 0.2053641138825127, + "learning_rate": 1.3730045881653637e-05, + "loss": 0.2724, + "num_tokens": 1879304301.0, + "step": 2463 + }, + { + "epoch": 3.3618789540503347, + "grad_norm": 0.1938436681002965, + "learning_rate": 1.3725308804287075e-05, + "loss": 0.2765, + "num_tokens": 1880167411.0, + "step": 2464 + }, + { + "epoch": 3.3632445350090148, + "grad_norm": 0.19896691433652267, + "learning_rate": 1.3720570895546814e-05, + "loss": 0.2678, + "num_tokens": 1880951109.0, + "step": 2465 + }, + { + "epoch": 3.3646101159676953, + "grad_norm": 0.20641809136092917, + "learning_rate": 1.3715832156878197e-05, + "loss": 0.2742, + "num_tokens": 1881693312.0, + "step": 2466 + }, + { + "epoch": 3.365975696926376, + "grad_norm": 0.1868806566599478, + "learning_rate": 1.37110925897268e-05, + "loss": 0.2681, + "num_tokens": 1882440139.0, + "step": 2467 + }, + { + "epoch": 3.3673412778850564, + "grad_norm": 0.19087987163746697, + "learning_rate": 1.3706352195538457e-05, + "loss": 0.2788, + "num_tokens": 1883271173.0, + "step": 2468 + }, + { + "epoch": 3.368706858843737, + "grad_norm": 0.19728181390038432, + "learning_rate": 1.3701610975759267e-05, + "loss": 0.2695, + "num_tokens": 1884033818.0, + "step": 2469 + }, + { + "epoch": 3.3700724398024176, + "grad_norm": 0.19744136807443977, + "learning_rate": 1.3696868931835563e-05, + "loss": 0.2749, + "num_tokens": 1884807066.0, + "step": 2470 + }, + { + "epoch": 3.371438020761098, + "grad_norm": 0.20298352797963695, + "learning_rate": 1.3692126065213939e-05, + "loss": 0.2714, + "num_tokens": 1885592805.0, + "step": 2471 + }, + { + "epoch": 3.3728036017197787, + "grad_norm": 0.19179462600346447, + "learning_rate": 1.3687382377341241e-05, + "loss": 0.284, + "num_tokens": 1886496945.0, + "step": 2472 + }, + { + "epoch": 3.3741691826784592, + "grad_norm": 0.1917417755239768, + "learning_rate": 1.3682637869664556e-05, + "loss": 0.275, + "num_tokens": 1887261885.0, + "step": 2473 + }, + { + "epoch": 3.3755347636371393, + "grad_norm": 0.20338183972265308, + "learning_rate": 1.3677892543631234e-05, + "loss": 0.2664, + "num_tokens": 1887965663.0, + "step": 2474 + }, + { + "epoch": 3.37690034459582, + "grad_norm": 0.1939924387253658, + "learning_rate": 1.3673146400688868e-05, + "loss": 0.2695, + "num_tokens": 1888744626.0, + "step": 2475 + }, + { + "epoch": 3.3782659255545004, + "grad_norm": 0.18963691754621137, + "learning_rate": 1.3668399442285299e-05, + "loss": 0.2702, + "num_tokens": 1889487824.0, + "step": 2476 + }, + { + "epoch": 3.379631506513181, + "grad_norm": 0.19401112564812104, + "learning_rate": 1.3663651669868614e-05, + "loss": 0.266, + "num_tokens": 1890221171.0, + "step": 2477 + }, + { + "epoch": 3.3809970874718616, + "grad_norm": 0.17970366470464838, + "learning_rate": 1.3658903084887162e-05, + "loss": 0.2841, + "num_tokens": 1890986458.0, + "step": 2478 + }, + { + "epoch": 3.382362668430542, + "grad_norm": 0.209819324660752, + "learning_rate": 1.3654153688789529e-05, + "loss": 0.2678, + "num_tokens": 1891726386.0, + "step": 2479 + }, + { + "epoch": 3.3837282493892227, + "grad_norm": 0.19468479445014905, + "learning_rate": 1.3649403483024541e-05, + "loss": 0.2775, + "num_tokens": 1892403062.0, + "step": 2480 + }, + { + "epoch": 3.3850938303479032, + "grad_norm": 0.19372877877570702, + "learning_rate": 1.3644652469041291e-05, + "loss": 0.2818, + "num_tokens": 1893140884.0, + "step": 2481 + }, + { + "epoch": 3.386459411306584, + "grad_norm": 0.21671291637107462, + "learning_rate": 1.36399006482891e-05, + "loss": 0.2748, + "num_tokens": 1893883172.0, + "step": 2482 + }, + { + "epoch": 3.387824992265264, + "grad_norm": 0.1962409281487731, + "learning_rate": 1.3635148022217544e-05, + "loss": 0.2699, + "num_tokens": 1894656447.0, + "step": 2483 + }, + { + "epoch": 3.3891905732239445, + "grad_norm": 0.18812391806010237, + "learning_rate": 1.3630394592276451e-05, + "loss": 0.2675, + "num_tokens": 1895475016.0, + "step": 2484 + }, + { + "epoch": 3.390556154182625, + "grad_norm": 0.19683642602932855, + "learning_rate": 1.3625640359915873e-05, + "loss": 0.2727, + "num_tokens": 1896285167.0, + "step": 2485 + }, + { + "epoch": 3.3919217351413056, + "grad_norm": 0.179649493662047, + "learning_rate": 1.3620885326586136e-05, + "loss": 0.2651, + "num_tokens": 1897060389.0, + "step": 2486 + }, + { + "epoch": 3.393287316099986, + "grad_norm": 0.19712674329729446, + "learning_rate": 1.361612949373778e-05, + "loss": 0.2844, + "num_tokens": 1897907424.0, + "step": 2487 + }, + { + "epoch": 3.3946528970586667, + "grad_norm": 0.19348003716670728, + "learning_rate": 1.3611372862821614e-05, + "loss": 0.2753, + "num_tokens": 1898709519.0, + "step": 2488 + }, + { + "epoch": 3.3960184780173472, + "grad_norm": 0.1874047937541052, + "learning_rate": 1.3606615435288675e-05, + "loss": 0.2763, + "num_tokens": 1899368170.0, + "step": 2489 + }, + { + "epoch": 3.397384058976028, + "grad_norm": 0.20103156215225343, + "learning_rate": 1.3601857212590249e-05, + "loss": 0.2707, + "num_tokens": 1900133953.0, + "step": 2490 + }, + { + "epoch": 3.3987496399347084, + "grad_norm": 0.2007663818279447, + "learning_rate": 1.3597098196177866e-05, + "loss": 0.2686, + "num_tokens": 1900916871.0, + "step": 2491 + }, + { + "epoch": 3.4001152208933885, + "grad_norm": 0.1913154928112898, + "learning_rate": 1.3592338387503295e-05, + "loss": 0.2838, + "num_tokens": 1901743286.0, + "step": 2492 + }, + { + "epoch": 3.401480801852069, + "grad_norm": 0.18371347394543572, + "learning_rate": 1.3587577788018545e-05, + "loss": 0.2755, + "num_tokens": 1902545155.0, + "step": 2493 + }, + { + "epoch": 3.4028463828107496, + "grad_norm": 0.20021289993918298, + "learning_rate": 1.358281639917587e-05, + "loss": 0.2613, + "num_tokens": 1903277404.0, + "step": 2494 + }, + { + "epoch": 3.40421196376943, + "grad_norm": 0.19384550242090315, + "learning_rate": 1.3578054222427764e-05, + "loss": 0.2528, + "num_tokens": 1904004212.0, + "step": 2495 + }, + { + "epoch": 3.4055775447281107, + "grad_norm": 0.19035716421672197, + "learning_rate": 1.3573291259226959e-05, + "loss": 0.265, + "num_tokens": 1904686245.0, + "step": 2496 + }, + { + "epoch": 3.4069431256867913, + "grad_norm": 0.20240596964074745, + "learning_rate": 1.3568527511026435e-05, + "loss": 0.2596, + "num_tokens": 1905402694.0, + "step": 2497 + }, + { + "epoch": 3.408308706645472, + "grad_norm": 0.19297427680815182, + "learning_rate": 1.3563762979279398e-05, + "loss": 0.2557, + "num_tokens": 1906137761.0, + "step": 2498 + }, + { + "epoch": 3.4096742876041524, + "grad_norm": 0.172725618734487, + "learning_rate": 1.3558997665439302e-05, + "loss": 0.2848, + "num_tokens": 1906983650.0, + "step": 2499 + }, + { + "epoch": 3.411039868562833, + "grad_norm": 0.20377964053038924, + "learning_rate": 1.3554231570959842e-05, + "loss": 0.2704, + "num_tokens": 1907718910.0, + "step": 2500 + }, + { + "epoch": 3.412405449521513, + "grad_norm": 0.20708230060795674, + "learning_rate": 1.354946469729494e-05, + "loss": 0.2885, + "num_tokens": 1908476952.0, + "step": 2501 + }, + { + "epoch": 3.4137710304801936, + "grad_norm": 0.2029456493004474, + "learning_rate": 1.3544697045898773e-05, + "loss": 0.2757, + "num_tokens": 1909207694.0, + "step": 2502 + }, + { + "epoch": 3.415136611438874, + "grad_norm": 0.2027907367834815, + "learning_rate": 1.3539928618225737e-05, + "loss": 0.2722, + "num_tokens": 1909966790.0, + "step": 2503 + }, + { + "epoch": 3.4165021923975547, + "grad_norm": 0.18771884084846815, + "learning_rate": 1.3535159415730472e-05, + "loss": 0.263, + "num_tokens": 1910756122.0, + "step": 2504 + }, + { + "epoch": 3.4178677733562353, + "grad_norm": 0.1720881833827725, + "learning_rate": 1.3530389439867857e-05, + "loss": 0.2807, + "num_tokens": 1911550831.0, + "step": 2505 + }, + { + "epoch": 3.419233354314916, + "grad_norm": 0.19301820060729827, + "learning_rate": 1.352561869209301e-05, + "loss": 0.2675, + "num_tokens": 1912321032.0, + "step": 2506 + }, + { + "epoch": 3.4205989352735964, + "grad_norm": 0.19378443466087455, + "learning_rate": 1.352084717386127e-05, + "loss": 0.2714, + "num_tokens": 1913045546.0, + "step": 2507 + }, + { + "epoch": 3.421964516232277, + "grad_norm": 0.18576755896524016, + "learning_rate": 1.3516074886628227e-05, + "loss": 0.2712, + "num_tokens": 1913843770.0, + "step": 2508 + }, + { + "epoch": 3.4233300971909575, + "grad_norm": 0.18615933562394205, + "learning_rate": 1.3511301831849702e-05, + "loss": 0.2734, + "num_tokens": 1914663045.0, + "step": 2509 + }, + { + "epoch": 3.4246956781496376, + "grad_norm": 0.2010388051258587, + "learning_rate": 1.3506528010981742e-05, + "loss": 0.2759, + "num_tokens": 1915374848.0, + "step": 2510 + }, + { + "epoch": 3.426061259108318, + "grad_norm": 0.20805550464081904, + "learning_rate": 1.3501753425480631e-05, + "loss": 0.2751, + "num_tokens": 1916150049.0, + "step": 2511 + }, + { + "epoch": 3.4274268400669987, + "grad_norm": 0.18851166363777921, + "learning_rate": 1.3496978076802896e-05, + "loss": 0.2671, + "num_tokens": 1916878480.0, + "step": 2512 + }, + { + "epoch": 3.4287924210256793, + "grad_norm": 0.19383299677484198, + "learning_rate": 1.349220196640528e-05, + "loss": 0.2709, + "num_tokens": 1917643468.0, + "step": 2513 + }, + { + "epoch": 3.43015800198436, + "grad_norm": 0.19555487284818668, + "learning_rate": 1.3487425095744775e-05, + "loss": 0.2667, + "num_tokens": 1918402807.0, + "step": 2514 + }, + { + "epoch": 3.4315235829430404, + "grad_norm": 0.17720645845372424, + "learning_rate": 1.3482647466278595e-05, + "loss": 0.2785, + "num_tokens": 1919108193.0, + "step": 2515 + }, + { + "epoch": 3.432889163901721, + "grad_norm": 0.2085794357080601, + "learning_rate": 1.3477869079464185e-05, + "loss": 0.2799, + "num_tokens": 1919920785.0, + "step": 2516 + }, + { + "epoch": 3.4342547448604015, + "grad_norm": 0.20141684231959586, + "learning_rate": 1.3473089936759227e-05, + "loss": 0.2726, + "num_tokens": 1920675676.0, + "step": 2517 + }, + { + "epoch": 3.435620325819082, + "grad_norm": 0.20117236436187297, + "learning_rate": 1.3468310039621633e-05, + "loss": 0.2789, + "num_tokens": 1921393551.0, + "step": 2518 + }, + { + "epoch": 3.436985906777762, + "grad_norm": 0.18359171538845093, + "learning_rate": 1.3463529389509537e-05, + "loss": 0.2757, + "num_tokens": 1922151515.0, + "step": 2519 + }, + { + "epoch": 3.4383514877364427, + "grad_norm": 0.1977178753223143, + "learning_rate": 1.3458747987881312e-05, + "loss": 0.2642, + "num_tokens": 1922938495.0, + "step": 2520 + }, + { + "epoch": 3.4397170686951233, + "grad_norm": 0.18538075094600875, + "learning_rate": 1.3453965836195553e-05, + "loss": 0.2552, + "num_tokens": 1923678478.0, + "step": 2521 + }, + { + "epoch": 3.441082649653804, + "grad_norm": 0.1950919222247887, + "learning_rate": 1.3449182935911088e-05, + "loss": 0.2599, + "num_tokens": 1924441622.0, + "step": 2522 + }, + { + "epoch": 3.4424482306124844, + "grad_norm": 0.1882237133241943, + "learning_rate": 1.3444399288486977e-05, + "loss": 0.2686, + "num_tokens": 1925285595.0, + "step": 2523 + }, + { + "epoch": 3.443813811571165, + "grad_norm": 0.19152197216924946, + "learning_rate": 1.3439614895382502e-05, + "loss": 0.2844, + "num_tokens": 1926034125.0, + "step": 2524 + }, + { + "epoch": 3.4451793925298455, + "grad_norm": 0.20280915368768077, + "learning_rate": 1.3434829758057172e-05, + "loss": 0.2779, + "num_tokens": 1926784034.0, + "step": 2525 + }, + { + "epoch": 3.446544973488526, + "grad_norm": 0.19642324265407612, + "learning_rate": 1.3430043877970727e-05, + "loss": 0.2666, + "num_tokens": 1927584954.0, + "step": 2526 + }, + { + "epoch": 3.4479105544472066, + "grad_norm": 0.20032063623894614, + "learning_rate": 1.3425257256583128e-05, + "loss": 0.2707, + "num_tokens": 1928355876.0, + "step": 2527 + }, + { + "epoch": 3.4492761354058867, + "grad_norm": 0.1975555915710422, + "learning_rate": 1.3420469895354572e-05, + "loss": 0.2698, + "num_tokens": 1929099433.0, + "step": 2528 + }, + { + "epoch": 3.4506417163645673, + "grad_norm": 0.21009271639145913, + "learning_rate": 1.3415681795745472e-05, + "loss": 0.2567, + "num_tokens": 1929793860.0, + "step": 2529 + }, + { + "epoch": 3.452007297323248, + "grad_norm": 0.19852528194959224, + "learning_rate": 1.3410892959216471e-05, + "loss": 0.274, + "num_tokens": 1930543005.0, + "step": 2530 + }, + { + "epoch": 3.4533728782819284, + "grad_norm": 0.2040496950466502, + "learning_rate": 1.3406103387228434e-05, + "loss": 0.2765, + "num_tokens": 1931272603.0, + "step": 2531 + }, + { + "epoch": 3.454738459240609, + "grad_norm": 0.19307050671548423, + "learning_rate": 1.340131308124245e-05, + "loss": 0.2623, + "num_tokens": 1932043592.0, + "step": 2532 + }, + { + "epoch": 3.4561040401992895, + "grad_norm": 0.20658651738071773, + "learning_rate": 1.3396522042719844e-05, + "loss": 0.2858, + "num_tokens": 1932771613.0, + "step": 2533 + }, + { + "epoch": 3.45746962115797, + "grad_norm": 0.2036538194352677, + "learning_rate": 1.3391730273122143e-05, + "loss": 0.2665, + "num_tokens": 1933545792.0, + "step": 2534 + }, + { + "epoch": 3.4588352021166506, + "grad_norm": 0.1920918659159465, + "learning_rate": 1.338693777391111e-05, + "loss": 0.2629, + "num_tokens": 1934247075.0, + "step": 2535 + }, + { + "epoch": 3.460200783075331, + "grad_norm": 0.1943059498371593, + "learning_rate": 1.3382144546548737e-05, + "loss": 0.2596, + "num_tokens": 1935038575.0, + "step": 2536 + }, + { + "epoch": 3.4615663640340113, + "grad_norm": 0.19318730096807168, + "learning_rate": 1.3377350592497222e-05, + "loss": 0.2706, + "num_tokens": 1935828880.0, + "step": 2537 + }, + { + "epoch": 3.462931944992692, + "grad_norm": 0.197044712851578, + "learning_rate": 1.3372555913218994e-05, + "loss": 0.2664, + "num_tokens": 1936600590.0, + "step": 2538 + }, + { + "epoch": 3.4642975259513724, + "grad_norm": 0.1852762750949683, + "learning_rate": 1.3367760510176706e-05, + "loss": 0.2606, + "num_tokens": 1937386674.0, + "step": 2539 + }, + { + "epoch": 3.465663106910053, + "grad_norm": 0.1977998907819235, + "learning_rate": 1.3362964384833226e-05, + "loss": 0.2696, + "num_tokens": 1938199391.0, + "step": 2540 + }, + { + "epoch": 3.4670286878687335, + "grad_norm": 0.19029359552648914, + "learning_rate": 1.3358167538651645e-05, + "loss": 0.2794, + "num_tokens": 1938919703.0, + "step": 2541 + }, + { + "epoch": 3.468394268827414, + "grad_norm": 0.1962455699681018, + "learning_rate": 1.3353369973095268e-05, + "loss": 0.2649, + "num_tokens": 1939589122.0, + "step": 2542 + }, + { + "epoch": 3.4697598497860946, + "grad_norm": 0.20725837787793033, + "learning_rate": 1.3348571689627629e-05, + "loss": 0.2607, + "num_tokens": 1940293135.0, + "step": 2543 + }, + { + "epoch": 3.471125430744775, + "grad_norm": 0.20254833021537585, + "learning_rate": 1.3343772689712477e-05, + "loss": 0.2868, + "num_tokens": 1941056699.0, + "step": 2544 + }, + { + "epoch": 3.4724910117034558, + "grad_norm": 0.2145236154593661, + "learning_rate": 1.3338972974813773e-05, + "loss": 0.2655, + "num_tokens": 1941771437.0, + "step": 2545 + }, + { + "epoch": 3.473856592662136, + "grad_norm": 0.19949528200866834, + "learning_rate": 1.3334172546395711e-05, + "loss": 0.2728, + "num_tokens": 1942524165.0, + "step": 2546 + }, + { + "epoch": 3.4752221736208164, + "grad_norm": 0.20771278234908777, + "learning_rate": 1.3329371405922688e-05, + "loss": 0.2632, + "num_tokens": 1943256919.0, + "step": 2547 + }, + { + "epoch": 3.476587754579497, + "grad_norm": 0.41857656568113377, + "learning_rate": 1.3324569554859325e-05, + "loss": 0.2636, + "num_tokens": 1943962410.0, + "step": 2548 + }, + { + "epoch": 3.4779533355381775, + "grad_norm": 0.2209847204678078, + "learning_rate": 1.3319766994670462e-05, + "loss": 0.2824, + "num_tokens": 1944748848.0, + "step": 2549 + }, + { + "epoch": 3.479318916496858, + "grad_norm": 0.20713280345759105, + "learning_rate": 1.3314963726821146e-05, + "loss": 0.2752, + "num_tokens": 1945517537.0, + "step": 2550 + }, + { + "epoch": 3.4806844974555387, + "grad_norm": 0.19283812412856524, + "learning_rate": 1.3310159752776652e-05, + "loss": 0.2741, + "num_tokens": 1946276483.0, + "step": 2551 + }, + { + "epoch": 3.482050078414219, + "grad_norm": 0.19909290020160622, + "learning_rate": 1.3305355074002461e-05, + "loss": 0.2708, + "num_tokens": 1947018662.0, + "step": 2552 + }, + { + "epoch": 3.4834156593728998, + "grad_norm": 0.2003826252323021, + "learning_rate": 1.3300549691964272e-05, + "loss": 0.2605, + "num_tokens": 1947804065.0, + "step": 2553 + }, + { + "epoch": 3.4847812403315803, + "grad_norm": 0.19213728246126396, + "learning_rate": 1.3295743608128002e-05, + "loss": 0.2665, + "num_tokens": 1948582658.0, + "step": 2554 + }, + { + "epoch": 3.4861468212902604, + "grad_norm": 0.19020390872365578, + "learning_rate": 1.3290936823959778e-05, + "loss": 0.2649, + "num_tokens": 1949378355.0, + "step": 2555 + }, + { + "epoch": 3.487512402248941, + "grad_norm": 0.1919980306058776, + "learning_rate": 1.3286129340925939e-05, + "loss": 0.2762, + "num_tokens": 1950131762.0, + "step": 2556 + }, + { + "epoch": 3.4888779832076215, + "grad_norm": 0.2009257651250307, + "learning_rate": 1.3281321160493043e-05, + "loss": 0.2648, + "num_tokens": 1950896103.0, + "step": 2557 + }, + { + "epoch": 3.490243564166302, + "grad_norm": 0.1995506703565711, + "learning_rate": 1.327651228412786e-05, + "loss": 0.2756, + "num_tokens": 1951643190.0, + "step": 2558 + }, + { + "epoch": 3.4916091451249827, + "grad_norm": 0.185958526724269, + "learning_rate": 1.3271702713297362e-05, + "loss": 0.2701, + "num_tokens": 1952365984.0, + "step": 2559 + }, + { + "epoch": 3.492974726083663, + "grad_norm": 0.18734736780097885, + "learning_rate": 1.3266892449468745e-05, + "loss": 0.2674, + "num_tokens": 1953134885.0, + "step": 2560 + }, + { + "epoch": 3.4943403070423438, + "grad_norm": 0.19251823110019026, + "learning_rate": 1.3262081494109415e-05, + "loss": 0.2759, + "num_tokens": 1953934754.0, + "step": 2561 + }, + { + "epoch": 3.4957058880010243, + "grad_norm": 0.19199313849764513, + "learning_rate": 1.3257269848686982e-05, + "loss": 0.2767, + "num_tokens": 1954617801.0, + "step": 2562 + }, + { + "epoch": 3.497071468959705, + "grad_norm": 0.18380246582856521, + "learning_rate": 1.3252457514669272e-05, + "loss": 0.2686, + "num_tokens": 1955332262.0, + "step": 2563 + }, + { + "epoch": 3.498437049918385, + "grad_norm": 0.19681265557312358, + "learning_rate": 1.3247644493524323e-05, + "loss": 0.2608, + "num_tokens": 1956079591.0, + "step": 2564 + }, + { + "epoch": 3.4998026308770656, + "grad_norm": 0.17713832483423744, + "learning_rate": 1.3242830786720373e-05, + "loss": 0.2665, + "num_tokens": 1956785519.0, + "step": 2565 + }, + { + "epoch": 3.501168211835746, + "grad_norm": 0.18800492911250277, + "learning_rate": 1.323801639572588e-05, + "loss": 0.2757, + "num_tokens": 1957605451.0, + "step": 2566 + }, + { + "epoch": 3.5025337927944267, + "grad_norm": 0.19447126754028624, + "learning_rate": 1.3233201322009504e-05, + "loss": 0.2688, + "num_tokens": 1958488516.0, + "step": 2567 + }, + { + "epoch": 3.5038993737531072, + "grad_norm": 0.18832072408479703, + "learning_rate": 1.3228385567040116e-05, + "loss": 0.2781, + "num_tokens": 1959308605.0, + "step": 2568 + }, + { + "epoch": 3.505264954711788, + "grad_norm": 0.17783297275431081, + "learning_rate": 1.3223569132286795e-05, + "loss": 0.2788, + "num_tokens": 1960109856.0, + "step": 2569 + }, + { + "epoch": 3.5066305356704683, + "grad_norm": 0.20042694905817546, + "learning_rate": 1.3218752019218829e-05, + "loss": 0.2733, + "num_tokens": 1960831380.0, + "step": 2570 + }, + { + "epoch": 3.507996116629149, + "grad_norm": 0.18949246502137007, + "learning_rate": 1.3213934229305702e-05, + "loss": 0.2664, + "num_tokens": 1961549095.0, + "step": 2571 + }, + { + "epoch": 3.5093616975878295, + "grad_norm": 0.18964435621637324, + "learning_rate": 1.3209115764017125e-05, + "loss": 0.2765, + "num_tokens": 1962328061.0, + "step": 2572 + }, + { + "epoch": 3.5107272785465096, + "grad_norm": 0.1928220862944233, + "learning_rate": 1.3204296624822994e-05, + "loss": 0.28, + "num_tokens": 1963089803.0, + "step": 2573 + }, + { + "epoch": 3.51209285950519, + "grad_norm": 0.18587022024654912, + "learning_rate": 1.3199476813193426e-05, + "loss": 0.2688, + "num_tokens": 1963873791.0, + "step": 2574 + }, + { + "epoch": 3.5134584404638707, + "grad_norm": 0.17641316441617477, + "learning_rate": 1.319465633059873e-05, + "loss": 0.2742, + "num_tokens": 1964710938.0, + "step": 2575 + }, + { + "epoch": 3.5148240214225512, + "grad_norm": 0.18636422222444946, + "learning_rate": 1.3189835178509427e-05, + "loss": 0.2736, + "num_tokens": 1965517489.0, + "step": 2576 + }, + { + "epoch": 3.516189602381232, + "grad_norm": 0.18407179287461142, + "learning_rate": 1.3185013358396253e-05, + "loss": 0.2826, + "num_tokens": 1966268072.0, + "step": 2577 + }, + { + "epoch": 3.5175551833399124, + "grad_norm": 0.19614632794925257, + "learning_rate": 1.3180190871730121e-05, + "loss": 0.2802, + "num_tokens": 1967054925.0, + "step": 2578 + }, + { + "epoch": 3.518920764298593, + "grad_norm": 0.20634466786235944, + "learning_rate": 1.3175367719982173e-05, + "loss": 0.2716, + "num_tokens": 1967832618.0, + "step": 2579 + }, + { + "epoch": 3.5202863452572735, + "grad_norm": 0.20206473937702774, + "learning_rate": 1.3170543904623745e-05, + "loss": 0.2717, + "num_tokens": 1968520499.0, + "step": 2580 + }, + { + "epoch": 3.521651926215954, + "grad_norm": 0.2329915402251311, + "learning_rate": 1.3165719427126364e-05, + "loss": 0.283, + "num_tokens": 1969224492.0, + "step": 2581 + }, + { + "epoch": 3.523017507174634, + "grad_norm": 0.2014463586915332, + "learning_rate": 1.3160894288961776e-05, + "loss": 0.2601, + "num_tokens": 1969983578.0, + "step": 2582 + }, + { + "epoch": 3.5243830881333147, + "grad_norm": 0.18346402291264471, + "learning_rate": 1.3156068491601924e-05, + "loss": 0.2777, + "num_tokens": 1970780398.0, + "step": 2583 + }, + { + "epoch": 3.5257486690919952, + "grad_norm": 0.18669106487588266, + "learning_rate": 1.3151242036518943e-05, + "loss": 0.2829, + "num_tokens": 1971546284.0, + "step": 2584 + }, + { + "epoch": 3.527114250050676, + "grad_norm": 0.18970488198479252, + "learning_rate": 1.3146414925185177e-05, + "loss": 0.2658, + "num_tokens": 1972321640.0, + "step": 2585 + }, + { + "epoch": 3.5284798310093564, + "grad_norm": 0.17836610048866824, + "learning_rate": 1.3141587159073176e-05, + "loss": 0.2724, + "num_tokens": 1973092596.0, + "step": 2586 + }, + { + "epoch": 3.529845411968037, + "grad_norm": 0.191779004659685, + "learning_rate": 1.3136758739655674e-05, + "loss": 0.2815, + "num_tokens": 1973788094.0, + "step": 2587 + }, + { + "epoch": 3.5312109929267175, + "grad_norm": 0.209553315428444, + "learning_rate": 1.3131929668405618e-05, + "loss": 0.2702, + "num_tokens": 1974611603.0, + "step": 2588 + }, + { + "epoch": 3.532576573885398, + "grad_norm": 0.18428076226219425, + "learning_rate": 1.3127099946796146e-05, + "loss": 0.282, + "num_tokens": 1975392803.0, + "step": 2589 + }, + { + "epoch": 3.5339421548440786, + "grad_norm": 0.18122919137488255, + "learning_rate": 1.3122269576300597e-05, + "loss": 0.2746, + "num_tokens": 1976171281.0, + "step": 2590 + }, + { + "epoch": 3.5353077358027587, + "grad_norm": 0.1905703479033559, + "learning_rate": 1.3117438558392503e-05, + "loss": 0.2876, + "num_tokens": 1976960959.0, + "step": 2591 + }, + { + "epoch": 3.5366733167614393, + "grad_norm": 0.19754457966522895, + "learning_rate": 1.3112606894545609e-05, + "loss": 0.2631, + "num_tokens": 1977733352.0, + "step": 2592 + }, + { + "epoch": 3.53803889772012, + "grad_norm": 0.1912159607643056, + "learning_rate": 1.310777458623384e-05, + "loss": 0.2759, + "num_tokens": 1978468671.0, + "step": 2593 + }, + { + "epoch": 3.5394044786788004, + "grad_norm": 0.18585790325928842, + "learning_rate": 1.3102941634931323e-05, + "loss": 0.2819, + "num_tokens": 1979266086.0, + "step": 2594 + }, + { + "epoch": 3.540770059637481, + "grad_norm": 0.2079923511216364, + "learning_rate": 1.3098108042112385e-05, + "loss": 0.2847, + "num_tokens": 1980066232.0, + "step": 2595 + }, + { + "epoch": 3.5421356405961615, + "grad_norm": 0.19334730459406207, + "learning_rate": 1.3093273809251546e-05, + "loss": 0.2692, + "num_tokens": 1980925828.0, + "step": 2596 + }, + { + "epoch": 3.543501221554842, + "grad_norm": 0.19617916186719186, + "learning_rate": 1.3088438937823518e-05, + "loss": 0.2722, + "num_tokens": 1981656027.0, + "step": 2597 + }, + { + "epoch": 3.5448668025135226, + "grad_norm": 0.20125307903020132, + "learning_rate": 1.308360342930321e-05, + "loss": 0.2686, + "num_tokens": 1982459508.0, + "step": 2598 + }, + { + "epoch": 3.546232383472203, + "grad_norm": 0.1884102600359854, + "learning_rate": 1.3078767285165733e-05, + "loss": 0.2744, + "num_tokens": 1983192509.0, + "step": 2599 + }, + { + "epoch": 3.5475979644308833, + "grad_norm": 0.2145912888357499, + "learning_rate": 1.3073930506886378e-05, + "loss": 0.277, + "num_tokens": 1983951615.0, + "step": 2600 + }, + { + "epoch": 3.548963545389564, + "grad_norm": 0.19796607180915476, + "learning_rate": 1.3069093095940644e-05, + "loss": 0.2779, + "num_tokens": 1984726049.0, + "step": 2601 + }, + { + "epoch": 3.5503291263482444, + "grad_norm": 0.1876446439700682, + "learning_rate": 1.3064255053804207e-05, + "loss": 0.279, + "num_tokens": 1985466794.0, + "step": 2602 + }, + { + "epoch": 3.551694707306925, + "grad_norm": 0.21125962050736097, + "learning_rate": 1.3059416381952953e-05, + "loss": 0.2791, + "num_tokens": 1986181342.0, + "step": 2603 + }, + { + "epoch": 3.5530602882656055, + "grad_norm": 0.19142851489082047, + "learning_rate": 1.305457708186294e-05, + "loss": 0.2657, + "num_tokens": 1987024765.0, + "step": 2604 + }, + { + "epoch": 3.554425869224286, + "grad_norm": 0.19094480285312718, + "learning_rate": 1.304973715501044e-05, + "loss": 0.2549, + "num_tokens": 1987886251.0, + "step": 2605 + }, + { + "epoch": 3.5557914501829666, + "grad_norm": 0.18115841333125474, + "learning_rate": 1.3044896602871899e-05, + "loss": 0.2566, + "num_tokens": 1988601355.0, + "step": 2606 + }, + { + "epoch": 3.557157031141647, + "grad_norm": 0.19172622005638135, + "learning_rate": 1.3040055426923961e-05, + "loss": 0.2782, + "num_tokens": 1989338032.0, + "step": 2607 + }, + { + "epoch": 3.5585226121003277, + "grad_norm": 0.1992859634815598, + "learning_rate": 1.3035213628643461e-05, + "loss": 0.2766, + "num_tokens": 1990166913.0, + "step": 2608 + }, + { + "epoch": 3.559888193059008, + "grad_norm": 0.1821027190112494, + "learning_rate": 1.303037120950742e-05, + "loss": 0.2762, + "num_tokens": 1990941252.0, + "step": 2609 + }, + { + "epoch": 3.5612537740176884, + "grad_norm": 0.18329733908695148, + "learning_rate": 1.302552817099305e-05, + "loss": 0.2719, + "num_tokens": 1991721991.0, + "step": 2610 + }, + { + "epoch": 3.562619354976369, + "grad_norm": 0.1954274203881713, + "learning_rate": 1.3020684514577757e-05, + "loss": 0.2732, + "num_tokens": 1992559224.0, + "step": 2611 + }, + { + "epoch": 3.5639849359350495, + "grad_norm": 0.1836353833501746, + "learning_rate": 1.3015840241739122e-05, + "loss": 0.2626, + "num_tokens": 1993262365.0, + "step": 2612 + }, + { + "epoch": 3.56535051689373, + "grad_norm": 0.20483328197908685, + "learning_rate": 1.3010995353954929e-05, + "loss": 0.2738, + "num_tokens": 1994020396.0, + "step": 2613 + }, + { + "epoch": 3.5667160978524106, + "grad_norm": 0.18888719127840975, + "learning_rate": 1.3006149852703145e-05, + "loss": 0.275, + "num_tokens": 1994809654.0, + "step": 2614 + }, + { + "epoch": 3.568081678811091, + "grad_norm": 0.17721732697252682, + "learning_rate": 1.300130373946192e-05, + "loss": 0.2754, + "num_tokens": 1995599209.0, + "step": 2615 + }, + { + "epoch": 3.5694472597697717, + "grad_norm": 0.21184523862972246, + "learning_rate": 1.2996457015709587e-05, + "loss": 0.2709, + "num_tokens": 1996275936.0, + "step": 2616 + }, + { + "epoch": 3.5708128407284523, + "grad_norm": 0.1894194308110246, + "learning_rate": 1.2991609682924683e-05, + "loss": 0.2712, + "num_tokens": 1997049058.0, + "step": 2617 + }, + { + "epoch": 3.5721784216871324, + "grad_norm": 0.19324649684130324, + "learning_rate": 1.2986761742585913e-05, + "loss": 0.279, + "num_tokens": 1997825402.0, + "step": 2618 + }, + { + "epoch": 3.573544002645813, + "grad_norm": 0.18049832777848993, + "learning_rate": 1.2981913196172176e-05, + "loss": 0.2604, + "num_tokens": 1998600108.0, + "step": 2619 + }, + { + "epoch": 3.5749095836044935, + "grad_norm": 0.1755039368072696, + "learning_rate": 1.2977064045162553e-05, + "loss": 0.2663, + "num_tokens": 1999312863.0, + "step": 2620 + }, + { + "epoch": 3.576275164563174, + "grad_norm": 0.183983226717234, + "learning_rate": 1.2972214291036307e-05, + "loss": 0.2737, + "num_tokens": 2000076698.0, + "step": 2621 + }, + { + "epoch": 3.5776407455218546, + "grad_norm": 0.19419771269729033, + "learning_rate": 1.296736393527289e-05, + "loss": 0.2732, + "num_tokens": 2000862634.0, + "step": 2622 + }, + { + "epoch": 3.579006326480535, + "grad_norm": 0.20014527878600524, + "learning_rate": 1.2962512979351936e-05, + "loss": 0.2915, + "num_tokens": 2001649940.0, + "step": 2623 + }, + { + "epoch": 3.5803719074392157, + "grad_norm": 0.19179278663010066, + "learning_rate": 1.295766142475326e-05, + "loss": 0.2702, + "num_tokens": 2002480706.0, + "step": 2624 + }, + { + "epoch": 3.5817374883978963, + "grad_norm": 0.20104056041765797, + "learning_rate": 1.2952809272956864e-05, + "loss": 0.2727, + "num_tokens": 2003238777.0, + "step": 2625 + }, + { + "epoch": 3.583103069356577, + "grad_norm": 0.20129892773419475, + "learning_rate": 1.2947956525442926e-05, + "loss": 0.2736, + "num_tokens": 2003940199.0, + "step": 2626 + }, + { + "epoch": 3.584468650315257, + "grad_norm": 0.2009056112660725, + "learning_rate": 1.2943103183691815e-05, + "loss": 0.2847, + "num_tokens": 2004695868.0, + "step": 2627 + }, + { + "epoch": 3.5858342312739375, + "grad_norm": 0.19656987007815585, + "learning_rate": 1.2938249249184067e-05, + "loss": 0.2742, + "num_tokens": 2005484205.0, + "step": 2628 + }, + { + "epoch": 3.587199812232618, + "grad_norm": 0.18548445520362633, + "learning_rate": 1.2933394723400414e-05, + "loss": 0.2741, + "num_tokens": 2006295966.0, + "step": 2629 + }, + { + "epoch": 3.5885653931912986, + "grad_norm": 0.1958557659730161, + "learning_rate": 1.2928539607821758e-05, + "loss": 0.2729, + "num_tokens": 2007121558.0, + "step": 2630 + }, + { + "epoch": 3.589930974149979, + "grad_norm": 0.18521889638111735, + "learning_rate": 1.2923683903929185e-05, + "loss": 0.2715, + "num_tokens": 2007926470.0, + "step": 2631 + }, + { + "epoch": 3.5912965551086597, + "grad_norm": 0.1905367404709362, + "learning_rate": 1.2918827613203964e-05, + "loss": 0.278, + "num_tokens": 2008778588.0, + "step": 2632 + }, + { + "epoch": 3.5926621360673403, + "grad_norm": 0.19606372132649233, + "learning_rate": 1.291397073712753e-05, + "loss": 0.2878, + "num_tokens": 2009540181.0, + "step": 2633 + }, + { + "epoch": 3.594027717026021, + "grad_norm": 0.19756952102606753, + "learning_rate": 1.2909113277181514e-05, + "loss": 0.2742, + "num_tokens": 2010296763.0, + "step": 2634 + }, + { + "epoch": 3.5953932979847014, + "grad_norm": 0.19932357284198993, + "learning_rate": 1.2904255234847718e-05, + "loss": 0.2799, + "num_tokens": 2011100490.0, + "step": 2635 + }, + { + "epoch": 3.5967588789433815, + "grad_norm": 0.19379643068183566, + "learning_rate": 1.2899396611608108e-05, + "loss": 0.2803, + "num_tokens": 2011863073.0, + "step": 2636 + }, + { + "epoch": 3.598124459902062, + "grad_norm": 0.25561938339021284, + "learning_rate": 1.2894537408944852e-05, + "loss": 0.2751, + "num_tokens": 2012680225.0, + "step": 2637 + }, + { + "epoch": 3.5994900408607426, + "grad_norm": 0.19456877373999115, + "learning_rate": 1.288967762834028e-05, + "loss": 0.2851, + "num_tokens": 2013466051.0, + "step": 2638 + }, + { + "epoch": 3.600855621819423, + "grad_norm": 0.20637521910436038, + "learning_rate": 1.2884817271276895e-05, + "loss": 0.2789, + "num_tokens": 2014236841.0, + "step": 2639 + }, + { + "epoch": 3.6022212027781038, + "grad_norm": 0.761712527429455, + "learning_rate": 1.2879956339237385e-05, + "loss": 0.2786, + "num_tokens": 2014999108.0, + "step": 2640 + }, + { + "epoch": 3.6035867837367843, + "grad_norm": 0.20592064905848018, + "learning_rate": 1.2875094833704611e-05, + "loss": 0.2902, + "num_tokens": 2015847716.0, + "step": 2641 + }, + { + "epoch": 3.604952364695465, + "grad_norm": 0.19271108990906194, + "learning_rate": 1.2870232756161606e-05, + "loss": 0.264, + "num_tokens": 2016575341.0, + "step": 2642 + }, + { + "epoch": 3.6063179456541454, + "grad_norm": 0.22280050896852308, + "learning_rate": 1.2865370108091584e-05, + "loss": 0.2814, + "num_tokens": 2017308005.0, + "step": 2643 + }, + { + "epoch": 3.607683526612826, + "grad_norm": 0.20959048945477554, + "learning_rate": 1.286050689097792e-05, + "loss": 0.2852, + "num_tokens": 2018126576.0, + "step": 2644 + }, + { + "epoch": 3.609049107571506, + "grad_norm": 0.20277190340387008, + "learning_rate": 1.2855643106304177e-05, + "loss": 0.2728, + "num_tokens": 2018877119.0, + "step": 2645 + }, + { + "epoch": 3.6104146885301867, + "grad_norm": 0.23266271148964704, + "learning_rate": 1.285077875555408e-05, + "loss": 0.2744, + "num_tokens": 2019642747.0, + "step": 2646 + }, + { + "epoch": 3.611780269488867, + "grad_norm": 0.19618290816274078, + "learning_rate": 1.2845913840211541e-05, + "loss": 0.276, + "num_tokens": 2020352591.0, + "step": 2647 + }, + { + "epoch": 3.6131458504475478, + "grad_norm": 0.2152301427475682, + "learning_rate": 1.2841048361760624e-05, + "loss": 0.2813, + "num_tokens": 2021142134.0, + "step": 2648 + }, + { + "epoch": 3.6145114314062283, + "grad_norm": 0.1980488193021334, + "learning_rate": 1.2836182321685585e-05, + "loss": 0.2665, + "num_tokens": 2021845019.0, + "step": 2649 + }, + { + "epoch": 3.615877012364909, + "grad_norm": 0.1854119596073934, + "learning_rate": 1.2831315721470834e-05, + "loss": 0.2716, + "num_tokens": 2022542438.0, + "step": 2650 + }, + { + "epoch": 3.6172425933235894, + "grad_norm": 0.21120018728703158, + "learning_rate": 1.2826448562600969e-05, + "loss": 0.2769, + "num_tokens": 2023340615.0, + "step": 2651 + }, + { + "epoch": 3.61860817428227, + "grad_norm": 0.18852834828105855, + "learning_rate": 1.2821580846560738e-05, + "loss": 0.2693, + "num_tokens": 2024042940.0, + "step": 2652 + }, + { + "epoch": 3.6199737552409506, + "grad_norm": 0.20405466039905662, + "learning_rate": 1.281671257483508e-05, + "loss": 0.2831, + "num_tokens": 2024840946.0, + "step": 2653 + }, + { + "epoch": 3.6213393361996307, + "grad_norm": 0.18033997197077126, + "learning_rate": 1.2811843748909092e-05, + "loss": 0.2744, + "num_tokens": 2025551275.0, + "step": 2654 + }, + { + "epoch": 3.622704917158311, + "grad_norm": 0.19822049117702406, + "learning_rate": 1.2806974370268035e-05, + "loss": 0.2704, + "num_tokens": 2026300509.0, + "step": 2655 + }, + { + "epoch": 3.624070498116992, + "grad_norm": 0.19126032224378545, + "learning_rate": 1.2802104440397354e-05, + "loss": 0.2698, + "num_tokens": 2027123993.0, + "step": 2656 + }, + { + "epoch": 3.6254360790756723, + "grad_norm": 0.18868169854758937, + "learning_rate": 1.2797233960782654e-05, + "loss": 0.2714, + "num_tokens": 2027882375.0, + "step": 2657 + }, + { + "epoch": 3.626801660034353, + "grad_norm": 0.19494948350338692, + "learning_rate": 1.27923629329097e-05, + "loss": 0.2816, + "num_tokens": 2028646558.0, + "step": 2658 + }, + { + "epoch": 3.6281672409930334, + "grad_norm": 0.1997236727921334, + "learning_rate": 1.278749135826444e-05, + "loss": 0.2758, + "num_tokens": 2029375579.0, + "step": 2659 + }, + { + "epoch": 3.629532821951714, + "grad_norm": 0.196931508194056, + "learning_rate": 1.2782619238332975e-05, + "loss": 0.2792, + "num_tokens": 2030206674.0, + "step": 2660 + }, + { + "epoch": 3.6308984029103946, + "grad_norm": 0.19094941460790268, + "learning_rate": 1.2777746574601575e-05, + "loss": 0.2786, + "num_tokens": 2030931719.0, + "step": 2661 + }, + { + "epoch": 3.632263983869075, + "grad_norm": 0.18737084837277906, + "learning_rate": 1.2772873368556687e-05, + "loss": 0.2814, + "num_tokens": 2031646398.0, + "step": 2662 + }, + { + "epoch": 3.6336295648277552, + "grad_norm": 0.20165298162048342, + "learning_rate": 1.276799962168491e-05, + "loss": 0.2689, + "num_tokens": 2032355625.0, + "step": 2663 + }, + { + "epoch": 3.634995145786436, + "grad_norm": 0.19417234660385413, + "learning_rate": 1.2763125335473013e-05, + "loss": 0.2673, + "num_tokens": 2033085957.0, + "step": 2664 + }, + { + "epoch": 3.6363607267451163, + "grad_norm": 0.18236449629585605, + "learning_rate": 1.2758250511407929e-05, + "loss": 0.2714, + "num_tokens": 2033901005.0, + "step": 2665 + }, + { + "epoch": 3.637726307703797, + "grad_norm": 0.18038632013280667, + "learning_rate": 1.275337515097676e-05, + "loss": 0.2795, + "num_tokens": 2034712876.0, + "step": 2666 + }, + { + "epoch": 3.6390918886624775, + "grad_norm": 0.18734806689163802, + "learning_rate": 1.2748499255666764e-05, + "loss": 0.2756, + "num_tokens": 2035502788.0, + "step": 2667 + }, + { + "epoch": 3.640457469621158, + "grad_norm": 0.18340321434801612, + "learning_rate": 1.2743622826965368e-05, + "loss": 0.2731, + "num_tokens": 2036298910.0, + "step": 2668 + }, + { + "epoch": 3.6418230505798386, + "grad_norm": 0.1825691459463724, + "learning_rate": 1.2738745866360154e-05, + "loss": 0.2754, + "num_tokens": 2037066288.0, + "step": 2669 + }, + { + "epoch": 3.643188631538519, + "grad_norm": 0.18923821560165238, + "learning_rate": 1.2733868375338875e-05, + "loss": 0.2636, + "num_tokens": 2037799088.0, + "step": 2670 + }, + { + "epoch": 3.6445542124971997, + "grad_norm": 0.18046538366054035, + "learning_rate": 1.2728990355389443e-05, + "loss": 0.2637, + "num_tokens": 2038649924.0, + "step": 2671 + }, + { + "epoch": 3.64591979345588, + "grad_norm": 0.18167410958431113, + "learning_rate": 1.2724111807999933e-05, + "loss": 0.2856, + "num_tokens": 2039486132.0, + "step": 2672 + }, + { + "epoch": 3.6472853744145604, + "grad_norm": 0.1859579098610536, + "learning_rate": 1.2719232734658571e-05, + "loss": 0.2674, + "num_tokens": 2040273686.0, + "step": 2673 + }, + { + "epoch": 3.648650955373241, + "grad_norm": 0.18296367951891823, + "learning_rate": 1.271435313685376e-05, + "loss": 0.2545, + "num_tokens": 2040919930.0, + "step": 2674 + }, + { + "epoch": 3.6500165363319215, + "grad_norm": 0.18984239391911453, + "learning_rate": 1.2709473016074045e-05, + "loss": 0.2672, + "num_tokens": 2041672532.0, + "step": 2675 + }, + { + "epoch": 3.651382117290602, + "grad_norm": 0.17753564751444464, + "learning_rate": 1.2704592373808142e-05, + "loss": 0.2664, + "num_tokens": 2042391154.0, + "step": 2676 + }, + { + "epoch": 3.6527476982492826, + "grad_norm": 0.19559072663247842, + "learning_rate": 1.269971121154493e-05, + "loss": 0.277, + "num_tokens": 2043094001.0, + "step": 2677 + }, + { + "epoch": 3.654113279207963, + "grad_norm": 0.1958963874317884, + "learning_rate": 1.2694829530773434e-05, + "loss": 0.2895, + "num_tokens": 2043924247.0, + "step": 2678 + }, + { + "epoch": 3.6554788601666437, + "grad_norm": 0.19344698258123805, + "learning_rate": 1.2689947332982843e-05, + "loss": 0.2692, + "num_tokens": 2044639662.0, + "step": 2679 + }, + { + "epoch": 3.6568444411253243, + "grad_norm": 0.21124840603401285, + "learning_rate": 1.2685064619662509e-05, + "loss": 0.2784, + "num_tokens": 2045380574.0, + "step": 2680 + }, + { + "epoch": 3.6582100220840044, + "grad_norm": 0.18657868552412768, + "learning_rate": 1.2680181392301933e-05, + "loss": 0.2695, + "num_tokens": 2046148297.0, + "step": 2681 + }, + { + "epoch": 3.659575603042685, + "grad_norm": 0.19755029582517397, + "learning_rate": 1.2675297652390779e-05, + "loss": 0.2751, + "num_tokens": 2046869787.0, + "step": 2682 + }, + { + "epoch": 3.6609411840013655, + "grad_norm": 0.19215252672737523, + "learning_rate": 1.2670413401418857e-05, + "loss": 0.2651, + "num_tokens": 2047625042.0, + "step": 2683 + }, + { + "epoch": 3.662306764960046, + "grad_norm": 0.18767241124428832, + "learning_rate": 1.2665528640876149e-05, + "loss": 0.2648, + "num_tokens": 2048306904.0, + "step": 2684 + }, + { + "epoch": 3.6636723459187266, + "grad_norm": 0.21343399165080304, + "learning_rate": 1.2660643372252779e-05, + "loss": 0.2614, + "num_tokens": 2049055911.0, + "step": 2685 + }, + { + "epoch": 3.665037926877407, + "grad_norm": 0.18964528445779366, + "learning_rate": 1.265575759703903e-05, + "loss": 0.2676, + "num_tokens": 2049874221.0, + "step": 2686 + }, + { + "epoch": 3.6664035078360877, + "grad_norm": 0.16806798512924506, + "learning_rate": 1.265087131672535e-05, + "loss": 0.2775, + "num_tokens": 2050594653.0, + "step": 2687 + }, + { + "epoch": 3.6677690887947683, + "grad_norm": 0.1879488780591285, + "learning_rate": 1.2645984532802318e-05, + "loss": 0.26, + "num_tokens": 2051268200.0, + "step": 2688 + }, + { + "epoch": 3.669134669753449, + "grad_norm": 0.20844188428551907, + "learning_rate": 1.264109724676069e-05, + "loss": 0.2704, + "num_tokens": 2052102523.0, + "step": 2689 + }, + { + "epoch": 3.670500250712129, + "grad_norm": 0.16878883160675093, + "learning_rate": 1.263620946009136e-05, + "loss": 0.2752, + "num_tokens": 2052888796.0, + "step": 2690 + }, + { + "epoch": 3.6718658316708095, + "grad_norm": 0.19544866381460788, + "learning_rate": 1.2631321174285385e-05, + "loss": 0.2683, + "num_tokens": 2053609407.0, + "step": 2691 + }, + { + "epoch": 3.67323141262949, + "grad_norm": 0.18885411564657836, + "learning_rate": 1.2626432390833962e-05, + "loss": 0.2737, + "num_tokens": 2054391530.0, + "step": 2692 + }, + { + "epoch": 3.6745969935881706, + "grad_norm": 0.18302791371672222, + "learning_rate": 1.2621543111228454e-05, + "loss": 0.2827, + "num_tokens": 2055192638.0, + "step": 2693 + }, + { + "epoch": 3.675962574546851, + "grad_norm": 0.1938932014844546, + "learning_rate": 1.2616653336960368e-05, + "loss": 0.2713, + "num_tokens": 2055933861.0, + "step": 2694 + }, + { + "epoch": 3.6773281555055317, + "grad_norm": 0.18332711214206296, + "learning_rate": 1.2611763069521355e-05, + "loss": 0.2774, + "num_tokens": 2056692053.0, + "step": 2695 + }, + { + "epoch": 3.6786937364642123, + "grad_norm": 0.2031376915225029, + "learning_rate": 1.260687231040323e-05, + "loss": 0.2728, + "num_tokens": 2057436737.0, + "step": 2696 + }, + { + "epoch": 3.680059317422893, + "grad_norm": 0.19479105811512495, + "learning_rate": 1.2601981061097957e-05, + "loss": 0.2736, + "num_tokens": 2058121427.0, + "step": 2697 + }, + { + "epoch": 3.6814248983815734, + "grad_norm": 0.20509748801547273, + "learning_rate": 1.2597089323097631e-05, + "loss": 0.263, + "num_tokens": 2058847138.0, + "step": 2698 + }, + { + "epoch": 3.6827904793402535, + "grad_norm": 0.18469655679963684, + "learning_rate": 1.2592197097894518e-05, + "loss": 0.2667, + "num_tokens": 2059574393.0, + "step": 2699 + }, + { + "epoch": 3.684156060298934, + "grad_norm": 0.20066495874261378, + "learning_rate": 1.2587304386981025e-05, + "loss": 0.2587, + "num_tokens": 2060297424.0, + "step": 2700 + }, + { + "epoch": 3.6855216412576146, + "grad_norm": 0.18155228010531013, + "learning_rate": 1.2582411191849702e-05, + "loss": 0.2769, + "num_tokens": 2061085934.0, + "step": 2701 + }, + { + "epoch": 3.686887222216295, + "grad_norm": 0.18600413435343388, + "learning_rate": 1.2577517513993251e-05, + "loss": 0.2801, + "num_tokens": 2061849310.0, + "step": 2702 + }, + { + "epoch": 3.6882528031749757, + "grad_norm": 0.19659669083187198, + "learning_rate": 1.2572623354904525e-05, + "loss": 0.2772, + "num_tokens": 2062601441.0, + "step": 2703 + }, + { + "epoch": 3.6896183841336563, + "grad_norm": 0.18447906898323174, + "learning_rate": 1.2567728716076519e-05, + "loss": 0.2803, + "num_tokens": 2063379133.0, + "step": 2704 + }, + { + "epoch": 3.690983965092337, + "grad_norm": 0.20722936681569312, + "learning_rate": 1.2562833599002376e-05, + "loss": 0.2791, + "num_tokens": 2064167002.0, + "step": 2705 + }, + { + "epoch": 3.6923495460510174, + "grad_norm": 0.19300014821417424, + "learning_rate": 1.2557938005175381e-05, + "loss": 0.2654, + "num_tokens": 2064910953.0, + "step": 2706 + }, + { + "epoch": 3.693715127009698, + "grad_norm": 0.19303988058598234, + "learning_rate": 1.2553041936088965e-05, + "loss": 0.275, + "num_tokens": 2065690666.0, + "step": 2707 + }, + { + "epoch": 3.695080707968378, + "grad_norm": 0.2047642652989613, + "learning_rate": 1.2548145393236715e-05, + "loss": 0.2891, + "num_tokens": 2066501502.0, + "step": 2708 + }, + { + "epoch": 3.6964462889270586, + "grad_norm": 0.1999941675856263, + "learning_rate": 1.254324837811235e-05, + "loss": 0.2691, + "num_tokens": 2067216923.0, + "step": 2709 + }, + { + "epoch": 3.697811869885739, + "grad_norm": 0.18586705101566192, + "learning_rate": 1.2538350892209734e-05, + "loss": 0.2704, + "num_tokens": 2068030916.0, + "step": 2710 + }, + { + "epoch": 3.6991774508444197, + "grad_norm": 0.20901193848640354, + "learning_rate": 1.2533452937022883e-05, + "loss": 0.273, + "num_tokens": 2068880764.0, + "step": 2711 + }, + { + "epoch": 3.7005430318031003, + "grad_norm": 0.17975874011744003, + "learning_rate": 1.252855451404595e-05, + "loss": 0.276, + "num_tokens": 2069671861.0, + "step": 2712 + }, + { + "epoch": 3.701908612761781, + "grad_norm": 0.18706385627403072, + "learning_rate": 1.2523655624773232e-05, + "loss": 0.2733, + "num_tokens": 2070450799.0, + "step": 2713 + }, + { + "epoch": 3.7032741937204614, + "grad_norm": 0.18975455102639588, + "learning_rate": 1.2518756270699164e-05, + "loss": 0.2733, + "num_tokens": 2071242957.0, + "step": 2714 + }, + { + "epoch": 3.704639774679142, + "grad_norm": 0.1807031933967458, + "learning_rate": 1.2513856453318332e-05, + "loss": 0.2838, + "num_tokens": 2072099839.0, + "step": 2715 + }, + { + "epoch": 3.7060053556378225, + "grad_norm": 0.18606169726167773, + "learning_rate": 1.2508956174125452e-05, + "loss": 0.2808, + "num_tokens": 2072865067.0, + "step": 2716 + }, + { + "epoch": 3.7073709365965026, + "grad_norm": 0.19416276301640129, + "learning_rate": 1.2504055434615393e-05, + "loss": 0.2763, + "num_tokens": 2073667708.0, + "step": 2717 + }, + { + "epoch": 3.708736517555183, + "grad_norm": 0.18527070927025358, + "learning_rate": 1.2499154236283157e-05, + "loss": 0.2833, + "num_tokens": 2074422608.0, + "step": 2718 + }, + { + "epoch": 3.7101020985138637, + "grad_norm": 0.18707274044725591, + "learning_rate": 1.2494252580623884e-05, + "loss": 0.2637, + "num_tokens": 2075228491.0, + "step": 2719 + }, + { + "epoch": 3.7114676794725443, + "grad_norm": 0.19452307798846163, + "learning_rate": 1.248935046913286e-05, + "loss": 0.2891, + "num_tokens": 2075963678.0, + "step": 2720 + }, + { + "epoch": 3.712833260431225, + "grad_norm": 0.20477156153089343, + "learning_rate": 1.2484447903305507e-05, + "loss": 0.2718, + "num_tokens": 2076709381.0, + "step": 2721 + }, + { + "epoch": 3.7141988413899054, + "grad_norm": 0.18372817797044128, + "learning_rate": 1.2479544884637387e-05, + "loss": 0.2621, + "num_tokens": 2077410213.0, + "step": 2722 + }, + { + "epoch": 3.715564422348586, + "grad_norm": 0.1911337726059762, + "learning_rate": 1.2474641414624191e-05, + "loss": 0.2746, + "num_tokens": 2078195231.0, + "step": 2723 + }, + { + "epoch": 3.7169300033072665, + "grad_norm": 0.18050590815202203, + "learning_rate": 1.2469737494761765e-05, + "loss": 0.2839, + "num_tokens": 2079025859.0, + "step": 2724 + }, + { + "epoch": 3.718295584265947, + "grad_norm": 0.20577172301925137, + "learning_rate": 1.2464833126546073e-05, + "loss": 0.2681, + "num_tokens": 2079723610.0, + "step": 2725 + }, + { + "epoch": 3.719661165224627, + "grad_norm": 0.19845720403249834, + "learning_rate": 1.2459928311473238e-05, + "loss": 0.2594, + "num_tokens": 2080443951.0, + "step": 2726 + }, + { + "epoch": 3.7210267461833078, + "grad_norm": 0.19297395835064893, + "learning_rate": 1.2455023051039496e-05, + "loss": 0.289, + "num_tokens": 2081248047.0, + "step": 2727 + }, + { + "epoch": 3.7223923271419883, + "grad_norm": 0.19574325622487967, + "learning_rate": 1.245011734674123e-05, + "loss": 0.2677, + "num_tokens": 2081990849.0, + "step": 2728 + }, + { + "epoch": 3.723757908100669, + "grad_norm": 0.17840723045202134, + "learning_rate": 1.2445211200074965e-05, + "loss": 0.2687, + "num_tokens": 2082732321.0, + "step": 2729 + }, + { + "epoch": 3.7251234890593494, + "grad_norm": 0.1846245191307462, + "learning_rate": 1.2440304612537345e-05, + "loss": 0.2635, + "num_tokens": 2083419807.0, + "step": 2730 + }, + { + "epoch": 3.72648907001803, + "grad_norm": 0.1950389053377489, + "learning_rate": 1.243539758562517e-05, + "loss": 0.2694, + "num_tokens": 2084122527.0, + "step": 2731 + }, + { + "epoch": 3.7278546509767105, + "grad_norm": 0.20307223338996785, + "learning_rate": 1.2430490120835343e-05, + "loss": 0.2785, + "num_tokens": 2084893016.0, + "step": 2732 + }, + { + "epoch": 3.729220231935391, + "grad_norm": 0.18975066496594, + "learning_rate": 1.2425582219664936e-05, + "loss": 0.2714, + "num_tokens": 2085649748.0, + "step": 2733 + }, + { + "epoch": 3.7305858128940717, + "grad_norm": 0.19244402354453602, + "learning_rate": 1.2420673883611127e-05, + "loss": 0.2838, + "num_tokens": 2086398509.0, + "step": 2734 + }, + { + "epoch": 3.7319513938527518, + "grad_norm": 0.19615807187116674, + "learning_rate": 1.2415765114171244e-05, + "loss": 0.2821, + "num_tokens": 2087238268.0, + "step": 2735 + }, + { + "epoch": 3.7333169748114323, + "grad_norm": 0.1724599414399223, + "learning_rate": 1.2410855912842734e-05, + "loss": 0.2825, + "num_tokens": 2087998936.0, + "step": 2736 + }, + { + "epoch": 3.734682555770113, + "grad_norm": 0.18935645652590646, + "learning_rate": 1.2405946281123186e-05, + "loss": 0.2858, + "num_tokens": 2088770875.0, + "step": 2737 + }, + { + "epoch": 3.7360481367287934, + "grad_norm": 0.18294946197861378, + "learning_rate": 1.2401036220510312e-05, + "loss": 0.2843, + "num_tokens": 2089559813.0, + "step": 2738 + }, + { + "epoch": 3.737413717687474, + "grad_norm": 0.17980966805747078, + "learning_rate": 1.239612573250196e-05, + "loss": 0.263, + "num_tokens": 2090266449.0, + "step": 2739 + }, + { + "epoch": 3.7387792986461545, + "grad_norm": 0.2060281870093605, + "learning_rate": 1.239121481859611e-05, + "loss": 0.2736, + "num_tokens": 2090959476.0, + "step": 2740 + }, + { + "epoch": 3.740144879604835, + "grad_norm": 0.18594089023363577, + "learning_rate": 1.2386303480290866e-05, + "loss": 0.2719, + "num_tokens": 2091747617.0, + "step": 2741 + }, + { + "epoch": 3.7415104605635157, + "grad_norm": 0.1797515906850866, + "learning_rate": 1.2381391719084465e-05, + "loss": 0.266, + "num_tokens": 2092555125.0, + "step": 2742 + }, + { + "epoch": 3.742876041522196, + "grad_norm": 0.18644546579867172, + "learning_rate": 1.2376479536475276e-05, + "loss": 0.2571, + "num_tokens": 2093277052.0, + "step": 2743 + }, + { + "epoch": 3.7442416224808763, + "grad_norm": 0.1942348020856113, + "learning_rate": 1.2371566933961791e-05, + "loss": 0.2778, + "num_tokens": 2093993271.0, + "step": 2744 + }, + { + "epoch": 3.745607203439557, + "grad_norm": 0.19911230108369407, + "learning_rate": 1.2366653913042633e-05, + "loss": 0.2759, + "num_tokens": 2094738564.0, + "step": 2745 + }, + { + "epoch": 3.7469727843982374, + "grad_norm": 0.19480569966507755, + "learning_rate": 1.2361740475216547e-05, + "loss": 0.2694, + "num_tokens": 2095489229.0, + "step": 2746 + }, + { + "epoch": 3.748338365356918, + "grad_norm": 0.17135929265441227, + "learning_rate": 1.2356826621982417e-05, + "loss": 0.2816, + "num_tokens": 2096289641.0, + "step": 2747 + }, + { + "epoch": 3.7497039463155986, + "grad_norm": 0.1937363727022568, + "learning_rate": 1.235191235483924e-05, + "loss": 0.2746, + "num_tokens": 2097060714.0, + "step": 2748 + }, + { + "epoch": 3.751069527274279, + "grad_norm": 0.17720977198663948, + "learning_rate": 1.2346997675286152e-05, + "loss": 0.27, + "num_tokens": 2097783057.0, + "step": 2749 + }, + { + "epoch": 3.7524351082329597, + "grad_norm": 0.1952270103136921, + "learning_rate": 1.2342082584822405e-05, + "loss": 0.2816, + "num_tokens": 2098559100.0, + "step": 2750 + }, + { + "epoch": 3.7538006891916402, + "grad_norm": 0.17959210305848036, + "learning_rate": 1.2337167084947385e-05, + "loss": 0.2729, + "num_tokens": 2099299098.0, + "step": 2751 + }, + { + "epoch": 3.755166270150321, + "grad_norm": 0.1866105064013948, + "learning_rate": 1.2332251177160596e-05, + "loss": 0.2676, + "num_tokens": 2100012292.0, + "step": 2752 + }, + { + "epoch": 3.756531851109001, + "grad_norm": 0.19921434939732213, + "learning_rate": 1.2327334862961665e-05, + "loss": 0.2819, + "num_tokens": 2100781243.0, + "step": 2753 + }, + { + "epoch": 3.7578974320676815, + "grad_norm": 0.19295128626425426, + "learning_rate": 1.2322418143850352e-05, + "loss": 0.2739, + "num_tokens": 2101515045.0, + "step": 2754 + }, + { + "epoch": 3.759263013026362, + "grad_norm": 0.18955166324852463, + "learning_rate": 1.231750102132653e-05, + "loss": 0.2674, + "num_tokens": 2102278399.0, + "step": 2755 + }, + { + "epoch": 3.7606285939850426, + "grad_norm": 0.1919399417360505, + "learning_rate": 1.2312583496890203e-05, + "loss": 0.2782, + "num_tokens": 2103039788.0, + "step": 2756 + }, + { + "epoch": 3.761994174943723, + "grad_norm": 0.19163597591661544, + "learning_rate": 1.2307665572041492e-05, + "loss": 0.2775, + "num_tokens": 2103772826.0, + "step": 2757 + }, + { + "epoch": 3.7633597559024037, + "grad_norm": 0.20857181673066194, + "learning_rate": 1.2302747248280651e-05, + "loss": 0.2781, + "num_tokens": 2104529421.0, + "step": 2758 + }, + { + "epoch": 3.7647253368610842, + "grad_norm": 0.1783327092192551, + "learning_rate": 1.2297828527108035e-05, + "loss": 0.2743, + "num_tokens": 2105241715.0, + "step": 2759 + }, + { + "epoch": 3.766090917819765, + "grad_norm": 0.2003581026522545, + "learning_rate": 1.2292909410024145e-05, + "loss": 0.2751, + "num_tokens": 2106076525.0, + "step": 2760 + }, + { + "epoch": 3.7674564987784454, + "grad_norm": 0.2032644801572929, + "learning_rate": 1.2287989898529582e-05, + "loss": 0.2896, + "num_tokens": 2106873302.0, + "step": 2761 + }, + { + "epoch": 3.7688220797371255, + "grad_norm": 0.18686585403459757, + "learning_rate": 1.228306999412508e-05, + "loss": 0.2829, + "num_tokens": 2107671218.0, + "step": 2762 + }, + { + "epoch": 3.770187660695806, + "grad_norm": 0.19075484788691757, + "learning_rate": 1.2278149698311488e-05, + "loss": 0.2691, + "num_tokens": 2108421521.0, + "step": 2763 + }, + { + "epoch": 3.7715532416544866, + "grad_norm": 0.19155880134762765, + "learning_rate": 1.2273229012589775e-05, + "loss": 0.2801, + "num_tokens": 2109174248.0, + "step": 2764 + }, + { + "epoch": 3.772918822613167, + "grad_norm": 0.19576166616485824, + "learning_rate": 1.2268307938461028e-05, + "loss": 0.2698, + "num_tokens": 2109856941.0, + "step": 2765 + }, + { + "epoch": 3.7742844035718477, + "grad_norm": 0.20556887999602738, + "learning_rate": 1.2263386477426455e-05, + "loss": 0.2707, + "num_tokens": 2110656486.0, + "step": 2766 + }, + { + "epoch": 3.7756499845305282, + "grad_norm": 0.1824956213016528, + "learning_rate": 1.2258464630987381e-05, + "loss": 0.2778, + "num_tokens": 2111457511.0, + "step": 2767 + }, + { + "epoch": 3.777015565489209, + "grad_norm": 0.19585954563985702, + "learning_rate": 1.2253542400645251e-05, + "loss": 0.2745, + "num_tokens": 2112183886.0, + "step": 2768 + }, + { + "epoch": 3.7783811464478894, + "grad_norm": 0.19659560064469153, + "learning_rate": 1.2248619787901616e-05, + "loss": 0.2747, + "num_tokens": 2112963378.0, + "step": 2769 + }, + { + "epoch": 3.77974672740657, + "grad_norm": 0.19174071238717225, + "learning_rate": 1.2243696794258158e-05, + "loss": 0.2639, + "num_tokens": 2113739620.0, + "step": 2770 + }, + { + "epoch": 3.78111230836525, + "grad_norm": 0.21871994435625502, + "learning_rate": 1.2238773421216669e-05, + "loss": 0.2978, + "num_tokens": 2114482553.0, + "step": 2771 + }, + { + "epoch": 3.7824778893239306, + "grad_norm": 0.21344046758266486, + "learning_rate": 1.2233849670279054e-05, + "loss": 0.2609, + "num_tokens": 2115305947.0, + "step": 2772 + }, + { + "epoch": 3.783843470282611, + "grad_norm": 0.19034071300625777, + "learning_rate": 1.2228925542947336e-05, + "loss": 0.2818, + "num_tokens": 2116055659.0, + "step": 2773 + }, + { + "epoch": 3.7852090512412917, + "grad_norm": 0.19200904017164308, + "learning_rate": 1.2224001040723661e-05, + "loss": 0.2831, + "num_tokens": 2116843763.0, + "step": 2774 + }, + { + "epoch": 3.7865746321999723, + "grad_norm": 0.1832623049268719, + "learning_rate": 1.2219076165110273e-05, + "loss": 0.2769, + "num_tokens": 2117637942.0, + "step": 2775 + }, + { + "epoch": 3.787940213158653, + "grad_norm": 0.18449035620230572, + "learning_rate": 1.2214150917609542e-05, + "loss": 0.2657, + "num_tokens": 2118347747.0, + "step": 2776 + }, + { + "epoch": 3.7893057941173334, + "grad_norm": 0.18194880416656836, + "learning_rate": 1.2209225299723945e-05, + "loss": 0.265, + "num_tokens": 2119104242.0, + "step": 2777 + }, + { + "epoch": 3.790671375076014, + "grad_norm": 0.20309455134351426, + "learning_rate": 1.2204299312956077e-05, + "loss": 0.2857, + "num_tokens": 2119900035.0, + "step": 2778 + }, + { + "epoch": 3.7920369560346945, + "grad_norm": 0.19734914208437798, + "learning_rate": 1.219937295880864e-05, + "loss": 0.2803, + "num_tokens": 2120604547.0, + "step": 2779 + }, + { + "epoch": 3.7934025369933746, + "grad_norm": 0.19987788932810982, + "learning_rate": 1.2194446238784455e-05, + "loss": 0.2773, + "num_tokens": 2121377425.0, + "step": 2780 + }, + { + "epoch": 3.794768117952055, + "grad_norm": 0.19005768219147293, + "learning_rate": 1.2189519154386448e-05, + "loss": 0.2662, + "num_tokens": 2122167477.0, + "step": 2781 + }, + { + "epoch": 3.7961336989107357, + "grad_norm": 0.18833601065097397, + "learning_rate": 1.2184591707117664e-05, + "loss": 0.2654, + "num_tokens": 2122968575.0, + "step": 2782 + }, + { + "epoch": 3.7974992798694163, + "grad_norm": 0.17927847203948158, + "learning_rate": 1.217966389848125e-05, + "loss": 0.2728, + "num_tokens": 2123699037.0, + "step": 2783 + }, + { + "epoch": 3.798864860828097, + "grad_norm": 0.2059540543092451, + "learning_rate": 1.2174735729980468e-05, + "loss": 0.2691, + "num_tokens": 2124458783.0, + "step": 2784 + }, + { + "epoch": 3.8002304417867774, + "grad_norm": 0.190636278591924, + "learning_rate": 1.2169807203118684e-05, + "loss": 0.2787, + "num_tokens": 2125297712.0, + "step": 2785 + }, + { + "epoch": 3.801596022745458, + "grad_norm": 0.18251056275473332, + "learning_rate": 1.2164878319399385e-05, + "loss": 0.2708, + "num_tokens": 2126064413.0, + "step": 2786 + }, + { + "epoch": 3.8029616037041385, + "grad_norm": 0.1937658326660195, + "learning_rate": 1.2159949080326156e-05, + "loss": 0.2804, + "num_tokens": 2126828111.0, + "step": 2787 + }, + { + "epoch": 3.804327184662819, + "grad_norm": 0.18992459991923663, + "learning_rate": 1.2155019487402697e-05, + "loss": 0.2668, + "num_tokens": 2127641641.0, + "step": 2788 + }, + { + "epoch": 3.805692765621499, + "grad_norm": 0.18240824864502217, + "learning_rate": 1.2150089542132815e-05, + "loss": 0.2573, + "num_tokens": 2128350185.0, + "step": 2789 + }, + { + "epoch": 3.8070583465801797, + "grad_norm": 0.18653203793404666, + "learning_rate": 1.2145159246020418e-05, + "loss": 0.2808, + "num_tokens": 2129111241.0, + "step": 2790 + }, + { + "epoch": 3.8084239275388603, + "grad_norm": 0.19422800088850103, + "learning_rate": 1.2140228600569525e-05, + "loss": 0.2701, + "num_tokens": 2129806659.0, + "step": 2791 + }, + { + "epoch": 3.809789508497541, + "grad_norm": 0.18943705583676687, + "learning_rate": 1.2135297607284268e-05, + "loss": 0.2736, + "num_tokens": 2130571628.0, + "step": 2792 + }, + { + "epoch": 3.8111550894562214, + "grad_norm": 0.19482674299618707, + "learning_rate": 1.2130366267668874e-05, + "loss": 0.2835, + "num_tokens": 2131389267.0, + "step": 2793 + }, + { + "epoch": 3.812520670414902, + "grad_norm": 0.21489912199630065, + "learning_rate": 1.2125434583227684e-05, + "loss": 0.288, + "num_tokens": 2132215214.0, + "step": 2794 + }, + { + "epoch": 3.8138862513735825, + "grad_norm": 0.18065096597718308, + "learning_rate": 1.2120502555465144e-05, + "loss": 0.2552, + "num_tokens": 2132958581.0, + "step": 2795 + }, + { + "epoch": 3.815251832332263, + "grad_norm": 0.2368835459542438, + "learning_rate": 1.2115570185885796e-05, + "loss": 0.2828, + "num_tokens": 2133691840.0, + "step": 2796 + }, + { + "epoch": 3.8166174132909436, + "grad_norm": 0.20623990902897232, + "learning_rate": 1.2110637475994295e-05, + "loss": 0.2669, + "num_tokens": 2134449953.0, + "step": 2797 + }, + { + "epoch": 3.8179829942496237, + "grad_norm": 0.19153767221914167, + "learning_rate": 1.2105704427295402e-05, + "loss": 0.2723, + "num_tokens": 2135283520.0, + "step": 2798 + }, + { + "epoch": 3.8193485752083043, + "grad_norm": 0.19623636416372042, + "learning_rate": 1.2100771041293971e-05, + "loss": 0.2721, + "num_tokens": 2136064065.0, + "step": 2799 + }, + { + "epoch": 3.820714156166985, + "grad_norm": 0.1824177792349838, + "learning_rate": 1.2095837319494963e-05, + "loss": 0.2801, + "num_tokens": 2136834562.0, + "step": 2800 + }, + { + "epoch": 3.8220797371256654, + "grad_norm": 0.18645821438240134, + "learning_rate": 1.2090903263403446e-05, + "loss": 0.2673, + "num_tokens": 2137572153.0, + "step": 2801 + }, + { + "epoch": 3.823445318084346, + "grad_norm": 0.19188856532441442, + "learning_rate": 1.2085968874524587e-05, + "loss": 0.2751, + "num_tokens": 2138372245.0, + "step": 2802 + }, + { + "epoch": 3.8248108990430265, + "grad_norm": 0.19120158184617256, + "learning_rate": 1.2081034154363653e-05, + "loss": 0.2772, + "num_tokens": 2139168394.0, + "step": 2803 + }, + { + "epoch": 3.826176480001707, + "grad_norm": 0.19236220709216992, + "learning_rate": 1.2076099104426018e-05, + "loss": 0.2729, + "num_tokens": 2139866388.0, + "step": 2804 + }, + { + "epoch": 3.8275420609603876, + "grad_norm": 0.20212239880626054, + "learning_rate": 1.2071163726217146e-05, + "loss": 0.2739, + "num_tokens": 2140640628.0, + "step": 2805 + }, + { + "epoch": 3.828907641919068, + "grad_norm": 0.19017019631164922, + "learning_rate": 1.2066228021242605e-05, + "loss": 0.2699, + "num_tokens": 2141444463.0, + "step": 2806 + }, + { + "epoch": 3.8302732228777483, + "grad_norm": 0.2080933767334414, + "learning_rate": 1.2061291991008073e-05, + "loss": 0.2658, + "num_tokens": 2142203551.0, + "step": 2807 + }, + { + "epoch": 3.831638803836429, + "grad_norm": 0.1950021763887986, + "learning_rate": 1.205635563701932e-05, + "loss": 0.2628, + "num_tokens": 2142951167.0, + "step": 2808 + }, + { + "epoch": 3.8330043847951094, + "grad_norm": 0.20166263943806925, + "learning_rate": 1.20514189607822e-05, + "loss": 0.2802, + "num_tokens": 2143700954.0, + "step": 2809 + }, + { + "epoch": 3.83436996575379, + "grad_norm": 0.19513575118875268, + "learning_rate": 1.204648196380269e-05, + "loss": 0.2785, + "num_tokens": 2144482898.0, + "step": 2810 + }, + { + "epoch": 3.8357355467124705, + "grad_norm": 0.2041560762899071, + "learning_rate": 1.2041544647586858e-05, + "loss": 0.2813, + "num_tokens": 2145292145.0, + "step": 2811 + }, + { + "epoch": 3.837101127671151, + "grad_norm": 0.19593471493138664, + "learning_rate": 1.2036607013640852e-05, + "loss": 0.2814, + "num_tokens": 2146005386.0, + "step": 2812 + }, + { + "epoch": 3.8384667086298316, + "grad_norm": 0.18899230303622547, + "learning_rate": 1.2031669063470944e-05, + "loss": 0.2807, + "num_tokens": 2146727011.0, + "step": 2813 + }, + { + "epoch": 3.839832289588512, + "grad_norm": 0.2077225961454414, + "learning_rate": 1.202673079858348e-05, + "loss": 0.2764, + "num_tokens": 2147449349.0, + "step": 2814 + }, + { + "epoch": 3.8411978705471927, + "grad_norm": 0.18084673245410918, + "learning_rate": 1.2021792220484911e-05, + "loss": 0.2621, + "num_tokens": 2148238855.0, + "step": 2815 + }, + { + "epoch": 3.842563451505873, + "grad_norm": 0.19278863364816576, + "learning_rate": 1.2016853330681788e-05, + "loss": 0.2803, + "num_tokens": 2149026579.0, + "step": 2816 + }, + { + "epoch": 3.8439290324645534, + "grad_norm": 0.19799782079464068, + "learning_rate": 1.201191413068075e-05, + "loss": 0.2687, + "num_tokens": 2149752579.0, + "step": 2817 + }, + { + "epoch": 3.845294613423234, + "grad_norm": 0.1980322795448722, + "learning_rate": 1.2006974621988532e-05, + "loss": 0.2768, + "num_tokens": 2150432996.0, + "step": 2818 + }, + { + "epoch": 3.8466601943819145, + "grad_norm": 0.19640457970592315, + "learning_rate": 1.2002034806111967e-05, + "loss": 0.2634, + "num_tokens": 2151121894.0, + "step": 2819 + }, + { + "epoch": 3.848025775340595, + "grad_norm": 0.19285745425234604, + "learning_rate": 1.1997094684557977e-05, + "loss": 0.2685, + "num_tokens": 2151894627.0, + "step": 2820 + }, + { + "epoch": 3.8493913562992756, + "grad_norm": 0.18070522458846378, + "learning_rate": 1.1992154258833578e-05, + "loss": 0.2657, + "num_tokens": 2152610488.0, + "step": 2821 + }, + { + "epoch": 3.850756937257956, + "grad_norm": 0.2008182571667235, + "learning_rate": 1.1987213530445882e-05, + "loss": 0.2831, + "num_tokens": 2153407328.0, + "step": 2822 + }, + { + "epoch": 3.8521225182166368, + "grad_norm": 0.19121095027398888, + "learning_rate": 1.198227250090209e-05, + "loss": 0.2805, + "num_tokens": 2154180150.0, + "step": 2823 + }, + { + "epoch": 3.8534880991753173, + "grad_norm": 0.19650126759720318, + "learning_rate": 1.1977331171709497e-05, + "loss": 0.2713, + "num_tokens": 2154897339.0, + "step": 2824 + }, + { + "epoch": 3.8548536801339974, + "grad_norm": 0.18851202423737617, + "learning_rate": 1.1972389544375487e-05, + "loss": 0.2772, + "num_tokens": 2155695471.0, + "step": 2825 + }, + { + "epoch": 3.856219261092678, + "grad_norm": 0.1945974071933573, + "learning_rate": 1.1967447620407538e-05, + "loss": 0.2807, + "num_tokens": 2156512906.0, + "step": 2826 + }, + { + "epoch": 3.8575848420513585, + "grad_norm": 0.18654299157709153, + "learning_rate": 1.1962505401313216e-05, + "loss": 0.2632, + "num_tokens": 2157319399.0, + "step": 2827 + }, + { + "epoch": 3.858950423010039, + "grad_norm": 0.18124345267382863, + "learning_rate": 1.1957562888600178e-05, + "loss": 0.2712, + "num_tokens": 2158040814.0, + "step": 2828 + }, + { + "epoch": 3.8603160039687197, + "grad_norm": 0.18622432029295977, + "learning_rate": 1.1952620083776174e-05, + "loss": 0.2656, + "num_tokens": 2158781074.0, + "step": 2829 + }, + { + "epoch": 3.8616815849274, + "grad_norm": 0.18352174955932296, + "learning_rate": 1.1947676988349036e-05, + "loss": 0.2863, + "num_tokens": 2159516726.0, + "step": 2830 + }, + { + "epoch": 3.8630471658860808, + "grad_norm": 0.2015465600636992, + "learning_rate": 1.1942733603826688e-05, + "loss": 0.2828, + "num_tokens": 2160199172.0, + "step": 2831 + }, + { + "epoch": 3.8644127468447613, + "grad_norm": 0.20016052505064758, + "learning_rate": 1.1937789931717148e-05, + "loss": 0.2833, + "num_tokens": 2160921798.0, + "step": 2832 + }, + { + "epoch": 3.865778327803442, + "grad_norm": 0.21059898374287045, + "learning_rate": 1.1932845973528505e-05, + "loss": 0.2756, + "num_tokens": 2161608658.0, + "step": 2833 + }, + { + "epoch": 3.867143908762122, + "grad_norm": 0.20264706074826996, + "learning_rate": 1.1927901730768958e-05, + "loss": 0.2774, + "num_tokens": 2162449314.0, + "step": 2834 + }, + { + "epoch": 3.8685094897208026, + "grad_norm": 0.18907174333805163, + "learning_rate": 1.1922957204946777e-05, + "loss": 0.2748, + "num_tokens": 2163216093.0, + "step": 2835 + }, + { + "epoch": 3.869875070679483, + "grad_norm": 0.1840669265970139, + "learning_rate": 1.191801239757032e-05, + "loss": 0.2812, + "num_tokens": 2164044896.0, + "step": 2836 + }, + { + "epoch": 3.8712406516381637, + "grad_norm": 0.1898737548650166, + "learning_rate": 1.1913067310148038e-05, + "loss": 0.2784, + "num_tokens": 2164853966.0, + "step": 2837 + }, + { + "epoch": 3.8726062325968442, + "grad_norm": 0.18677686946709626, + "learning_rate": 1.1908121944188463e-05, + "loss": 0.2741, + "num_tokens": 2165651610.0, + "step": 2838 + }, + { + "epoch": 3.873971813555525, + "grad_norm": 0.20300931358661378, + "learning_rate": 1.1903176301200207e-05, + "loss": 0.2771, + "num_tokens": 2166371873.0, + "step": 2839 + }, + { + "epoch": 3.8753373945142053, + "grad_norm": 0.19275251200213692, + "learning_rate": 1.1898230382691977e-05, + "loss": 0.2747, + "num_tokens": 2167075399.0, + "step": 2840 + }, + { + "epoch": 3.876702975472886, + "grad_norm": 0.20894970136598703, + "learning_rate": 1.1893284190172556e-05, + "loss": 0.2649, + "num_tokens": 2167773417.0, + "step": 2841 + }, + { + "epoch": 3.8780685564315664, + "grad_norm": 0.20494836046975884, + "learning_rate": 1.1888337725150814e-05, + "loss": 0.2752, + "num_tokens": 2168528681.0, + "step": 2842 + }, + { + "epoch": 3.8794341373902466, + "grad_norm": 0.1872388471022263, + "learning_rate": 1.1883390989135702e-05, + "loss": 0.277, + "num_tokens": 2169247210.0, + "step": 2843 + }, + { + "epoch": 3.880799718348927, + "grad_norm": 0.20396414516300146, + "learning_rate": 1.1878443983636257e-05, + "loss": 0.2853, + "num_tokens": 2170036730.0, + "step": 2844 + }, + { + "epoch": 3.8821652993076077, + "grad_norm": 0.19272763526273273, + "learning_rate": 1.1873496710161591e-05, + "loss": 0.2768, + "num_tokens": 2170830379.0, + "step": 2845 + }, + { + "epoch": 3.8835308802662882, + "grad_norm": 0.18694973469301485, + "learning_rate": 1.1868549170220909e-05, + "loss": 0.2711, + "num_tokens": 2171494407.0, + "step": 2846 + }, + { + "epoch": 3.884896461224969, + "grad_norm": 0.19711953383933029, + "learning_rate": 1.1863601365323488e-05, + "loss": 0.2935, + "num_tokens": 2172275877.0, + "step": 2847 + }, + { + "epoch": 3.8862620421836493, + "grad_norm": 0.20055484595531897, + "learning_rate": 1.1858653296978691e-05, + "loss": 0.2706, + "num_tokens": 2173008333.0, + "step": 2848 + }, + { + "epoch": 3.88762762314233, + "grad_norm": 0.19573237679959776, + "learning_rate": 1.1853704966695959e-05, + "loss": 0.2813, + "num_tokens": 2173782623.0, + "step": 2849 + }, + { + "epoch": 3.8889932041010105, + "grad_norm": 0.20181516341560166, + "learning_rate": 1.1848756375984809e-05, + "loss": 0.2643, + "num_tokens": 2174539187.0, + "step": 2850 + }, + { + "epoch": 3.890358785059691, + "grad_norm": 0.17178869736908148, + "learning_rate": 1.1843807526354846e-05, + "loss": 0.2745, + "num_tokens": 2175233857.0, + "step": 2851 + }, + { + "epoch": 3.891724366018371, + "grad_norm": 0.20902971706297804, + "learning_rate": 1.183885841931575e-05, + "loss": 0.2721, + "num_tokens": 2175985301.0, + "step": 2852 + }, + { + "epoch": 3.8930899469770517, + "grad_norm": 0.1870193762909068, + "learning_rate": 1.1833909056377281e-05, + "loss": 0.2832, + "num_tokens": 2176673538.0, + "step": 2853 + }, + { + "epoch": 3.8944555279357322, + "grad_norm": 0.21194901856308482, + "learning_rate": 1.182895943904927e-05, + "loss": 0.2676, + "num_tokens": 2177477467.0, + "step": 2854 + }, + { + "epoch": 3.895821108894413, + "grad_norm": 0.17876395595503408, + "learning_rate": 1.1824009568841632e-05, + "loss": 0.2696, + "num_tokens": 2178287413.0, + "step": 2855 + }, + { + "epoch": 3.8971866898530934, + "grad_norm": 0.19487015420267695, + "learning_rate": 1.181905944726436e-05, + "loss": 0.2743, + "num_tokens": 2179088898.0, + "step": 2856 + }, + { + "epoch": 3.898552270811774, + "grad_norm": 0.18129217050232926, + "learning_rate": 1.1814109075827522e-05, + "loss": 0.2551, + "num_tokens": 2179824483.0, + "step": 2857 + }, + { + "epoch": 3.8999178517704545, + "grad_norm": 0.18490276960640273, + "learning_rate": 1.180915845604126e-05, + "loss": 0.2926, + "num_tokens": 2180643390.0, + "step": 2858 + }, + { + "epoch": 3.901283432729135, + "grad_norm": 0.1772768893657513, + "learning_rate": 1.1804207589415795e-05, + "loss": 0.2733, + "num_tokens": 2181382910.0, + "step": 2859 + }, + { + "epoch": 3.9026490136878156, + "grad_norm": 0.18512565574132048, + "learning_rate": 1.1799256477461422e-05, + "loss": 0.2646, + "num_tokens": 2182167083.0, + "step": 2860 + }, + { + "epoch": 3.9040145946464957, + "grad_norm": 0.18802635469625328, + "learning_rate": 1.1794305121688514e-05, + "loss": 0.2737, + "num_tokens": 2182941310.0, + "step": 2861 + }, + { + "epoch": 3.9053801756051763, + "grad_norm": 0.17408014720458487, + "learning_rate": 1.1789353523607507e-05, + "loss": 0.2726, + "num_tokens": 2183721088.0, + "step": 2862 + }, + { + "epoch": 3.906745756563857, + "grad_norm": 0.19765611183490855, + "learning_rate": 1.1784401684728925e-05, + "loss": 0.2787, + "num_tokens": 2184502423.0, + "step": 2863 + }, + { + "epoch": 3.9081113375225374, + "grad_norm": 0.1902035076905486, + "learning_rate": 1.177944960656336e-05, + "loss": 0.2561, + "num_tokens": 2185174272.0, + "step": 2864 + }, + { + "epoch": 3.909476918481218, + "grad_norm": 0.19412802819161168, + "learning_rate": 1.1774497290621474e-05, + "loss": 0.2779, + "num_tokens": 2185831387.0, + "step": 2865 + }, + { + "epoch": 3.9108424994398985, + "grad_norm": 0.21270900142292118, + "learning_rate": 1.1769544738414004e-05, + "loss": 0.273, + "num_tokens": 2186565981.0, + "step": 2866 + }, + { + "epoch": 3.912208080398579, + "grad_norm": 0.1902908766953016, + "learning_rate": 1.1764591951451762e-05, + "loss": 0.2677, + "num_tokens": 2187260143.0, + "step": 2867 + }, + { + "epoch": 3.9135736613572596, + "grad_norm": 0.19198980808764732, + "learning_rate": 1.1759638931245628e-05, + "loss": 0.2756, + "num_tokens": 2188040523.0, + "step": 2868 + }, + { + "epoch": 3.91493924231594, + "grad_norm": 0.18557396022064426, + "learning_rate": 1.1754685679306554e-05, + "loss": 0.2839, + "num_tokens": 2188818887.0, + "step": 2869 + }, + { + "epoch": 3.9163048232746203, + "grad_norm": 0.20098004087771795, + "learning_rate": 1.1749732197145558e-05, + "loss": 0.282, + "num_tokens": 2189578574.0, + "step": 2870 + }, + { + "epoch": 3.917670404233301, + "grad_norm": 0.1898873011734509, + "learning_rate": 1.174477848627374e-05, + "loss": 0.2816, + "num_tokens": 2190320830.0, + "step": 2871 + }, + { + "epoch": 3.9190359851919814, + "grad_norm": 0.193475367842913, + "learning_rate": 1.1739824548202259e-05, + "loss": 0.2745, + "num_tokens": 2191045672.0, + "step": 2872 + }, + { + "epoch": 3.920401566150662, + "grad_norm": 0.18985540353442987, + "learning_rate": 1.1734870384442345e-05, + "loss": 0.2836, + "num_tokens": 2191844236.0, + "step": 2873 + }, + { + "epoch": 3.9217671471093425, + "grad_norm": 0.19032991024893606, + "learning_rate": 1.1729915996505303e-05, + "loss": 0.2617, + "num_tokens": 2192534480.0, + "step": 2874 + }, + { + "epoch": 3.923132728068023, + "grad_norm": 0.19068933655181186, + "learning_rate": 1.17249613859025e-05, + "loss": 0.2776, + "num_tokens": 2193224003.0, + "step": 2875 + }, + { + "epoch": 3.9244983090267036, + "grad_norm": 0.20036886834900808, + "learning_rate": 1.1720006554145374e-05, + "loss": 0.277, + "num_tokens": 2194045349.0, + "step": 2876 + }, + { + "epoch": 3.925863889985384, + "grad_norm": 0.18162354002831063, + "learning_rate": 1.1715051502745432e-05, + "loss": 0.2707, + "num_tokens": 2194860407.0, + "step": 2877 + }, + { + "epoch": 3.9272294709440647, + "grad_norm": 0.19111614772196125, + "learning_rate": 1.171009623321424e-05, + "loss": 0.2788, + "num_tokens": 2195668267.0, + "step": 2878 + }, + { + "epoch": 3.928595051902745, + "grad_norm": 0.19190639560630313, + "learning_rate": 1.1705140747063439e-05, + "loss": 0.2668, + "num_tokens": 2196456925.0, + "step": 2879 + }, + { + "epoch": 3.9299606328614254, + "grad_norm": 0.17448997136061678, + "learning_rate": 1.1700185045804732e-05, + "loss": 0.2766, + "num_tokens": 2197266440.0, + "step": 2880 + }, + { + "epoch": 3.931326213820106, + "grad_norm": 0.16958770313199834, + "learning_rate": 1.1695229130949891e-05, + "loss": 0.2619, + "num_tokens": 2197904976.0, + "step": 2881 + }, + { + "epoch": 3.9326917947787865, + "grad_norm": 0.2110751657477687, + "learning_rate": 1.169027300401075e-05, + "loss": 0.2743, + "num_tokens": 2198629232.0, + "step": 2882 + }, + { + "epoch": 3.934057375737467, + "grad_norm": 0.18683787008807903, + "learning_rate": 1.1685316666499206e-05, + "loss": 0.2849, + "num_tokens": 2199372275.0, + "step": 2883 + }, + { + "epoch": 3.9354229566961476, + "grad_norm": 0.19733794407089136, + "learning_rate": 1.1680360119927232e-05, + "loss": 0.2698, + "num_tokens": 2200148857.0, + "step": 2884 + }, + { + "epoch": 3.936788537654828, + "grad_norm": 0.18568556341205966, + "learning_rate": 1.1675403365806848e-05, + "loss": 0.2705, + "num_tokens": 2200889604.0, + "step": 2885 + }, + { + "epoch": 3.9381541186135087, + "grad_norm": 0.17824599028413488, + "learning_rate": 1.1670446405650142e-05, + "loss": 0.2815, + "num_tokens": 2201713357.0, + "step": 2886 + }, + { + "epoch": 3.9395196995721893, + "grad_norm": 0.18160301330440995, + "learning_rate": 1.1665489240969273e-05, + "loss": 0.2735, + "num_tokens": 2202495196.0, + "step": 2887 + }, + { + "epoch": 3.9408852805308694, + "grad_norm": 0.19077299926070004, + "learning_rate": 1.1660531873276457e-05, + "loss": 0.2683, + "num_tokens": 2203258383.0, + "step": 2888 + }, + { + "epoch": 3.94225086148955, + "grad_norm": 0.18495759285160313, + "learning_rate": 1.1655574304083972e-05, + "loss": 0.2861, + "num_tokens": 2204029846.0, + "step": 2889 + }, + { + "epoch": 3.9436164424482305, + "grad_norm": 0.18633925999410986, + "learning_rate": 1.1650616534904158e-05, + "loss": 0.2948, + "num_tokens": 2204786928.0, + "step": 2890 + }, + { + "epoch": 3.944982023406911, + "grad_norm": 0.19250033367057048, + "learning_rate": 1.1645658567249414e-05, + "loss": 0.274, + "num_tokens": 2205531132.0, + "step": 2891 + }, + { + "epoch": 3.9463476043655916, + "grad_norm": 0.19274571411161967, + "learning_rate": 1.1640700402632204e-05, + "loss": 0.2644, + "num_tokens": 2206351932.0, + "step": 2892 + }, + { + "epoch": 3.947713185324272, + "grad_norm": 0.1676250651852623, + "learning_rate": 1.1635742042565046e-05, + "loss": 0.2851, + "num_tokens": 2207155947.0, + "step": 2893 + }, + { + "epoch": 3.9490787662829527, + "grad_norm": 0.18340675860214828, + "learning_rate": 1.1630783488560524e-05, + "loss": 0.272, + "num_tokens": 2207996712.0, + "step": 2894 + }, + { + "epoch": 3.9504443472416333, + "grad_norm": 0.1829074343008109, + "learning_rate": 1.1625824742131273e-05, + "loss": 0.2788, + "num_tokens": 2208785219.0, + "step": 2895 + }, + { + "epoch": 3.951809928200314, + "grad_norm": 0.18017690325060814, + "learning_rate": 1.1620865804789998e-05, + "loss": 0.2691, + "num_tokens": 2209527451.0, + "step": 2896 + }, + { + "epoch": 3.953175509158994, + "grad_norm": 0.19004819143239177, + "learning_rate": 1.1615906678049456e-05, + "loss": 0.2572, + "num_tokens": 2210272063.0, + "step": 2897 + }, + { + "epoch": 3.9545410901176745, + "grad_norm": 0.1799236341485269, + "learning_rate": 1.1610947363422455e-05, + "loss": 0.2728, + "num_tokens": 2211077885.0, + "step": 2898 + }, + { + "epoch": 3.955906671076355, + "grad_norm": 0.1863873518844361, + "learning_rate": 1.160598786242187e-05, + "loss": 0.2814, + "num_tokens": 2211845544.0, + "step": 2899 + }, + { + "epoch": 3.9572722520350356, + "grad_norm": 0.18782859082686726, + "learning_rate": 1.1601028176560638e-05, + "loss": 0.269, + "num_tokens": 2212599513.0, + "step": 2900 + }, + { + "epoch": 3.958637832993716, + "grad_norm": 0.19778566364365016, + "learning_rate": 1.159606830735173e-05, + "loss": 0.2676, + "num_tokens": 2213334775.0, + "step": 2901 + }, + { + "epoch": 3.9600034139523967, + "grad_norm": 0.17974034418024576, + "learning_rate": 1.1591108256308199e-05, + "loss": 0.278, + "num_tokens": 2214186527.0, + "step": 2902 + }, + { + "epoch": 3.9613689949110773, + "grad_norm": 0.19966645226381996, + "learning_rate": 1.1586148024943139e-05, + "loss": 0.2725, + "num_tokens": 2214913266.0, + "step": 2903 + }, + { + "epoch": 3.962734575869758, + "grad_norm": 0.19502215134012568, + "learning_rate": 1.1581187614769697e-05, + "loss": 0.2788, + "num_tokens": 2215657562.0, + "step": 2904 + }, + { + "epoch": 3.9641001568284384, + "grad_norm": 0.1791941090336952, + "learning_rate": 1.1576227027301083e-05, + "loss": 0.2687, + "num_tokens": 2216420065.0, + "step": 2905 + }, + { + "epoch": 3.9654657377871185, + "grad_norm": 0.19711852886322961, + "learning_rate": 1.157126626405056e-05, + "loss": 0.2595, + "num_tokens": 2217179107.0, + "step": 2906 + }, + { + "epoch": 3.966831318745799, + "grad_norm": 0.1838505357100646, + "learning_rate": 1.1566305326531437e-05, + "loss": 0.2636, + "num_tokens": 2217875360.0, + "step": 2907 + }, + { + "epoch": 3.9681968997044796, + "grad_norm": 0.18495498299911414, + "learning_rate": 1.1561344216257087e-05, + "loss": 0.2657, + "num_tokens": 2218615164.0, + "step": 2908 + }, + { + "epoch": 3.96956248066316, + "grad_norm": 0.18997089681925353, + "learning_rate": 1.1556382934740926e-05, + "loss": 0.2696, + "num_tokens": 2219426841.0, + "step": 2909 + }, + { + "epoch": 3.9709280616218408, + "grad_norm": 0.1789481299060695, + "learning_rate": 1.1551421483496423e-05, + "loss": 0.2787, + "num_tokens": 2220217734.0, + "step": 2910 + }, + { + "epoch": 3.9722936425805213, + "grad_norm": 0.18948043335010747, + "learning_rate": 1.1546459864037105e-05, + "loss": 0.2626, + "num_tokens": 2220952925.0, + "step": 2911 + }, + { + "epoch": 3.973659223539202, + "grad_norm": 0.18081036130446076, + "learning_rate": 1.1541498077876552e-05, + "loss": 0.267, + "num_tokens": 2221740519.0, + "step": 2912 + }, + { + "epoch": 3.9750248044978824, + "grad_norm": 0.17924454203021675, + "learning_rate": 1.153653612652838e-05, + "loss": 0.2663, + "num_tokens": 2222435362.0, + "step": 2913 + }, + { + "epoch": 3.976390385456563, + "grad_norm": 0.1877619924262889, + "learning_rate": 1.1531574011506274e-05, + "loss": 0.2705, + "num_tokens": 2223221232.0, + "step": 2914 + }, + { + "epoch": 3.977755966415243, + "grad_norm": 0.188899741527969, + "learning_rate": 1.1526611734323957e-05, + "loss": 0.2846, + "num_tokens": 2224014841.0, + "step": 2915 + }, + { + "epoch": 3.9791215473739237, + "grad_norm": 0.18898338604630513, + "learning_rate": 1.152164929649521e-05, + "loss": 0.2644, + "num_tokens": 2224704990.0, + "step": 2916 + }, + { + "epoch": 3.980487128332604, + "grad_norm": 0.18259753813004154, + "learning_rate": 1.1516686699533848e-05, + "loss": 0.2794, + "num_tokens": 2225370203.0, + "step": 2917 + }, + { + "epoch": 3.9818527092912848, + "grad_norm": 0.2051986681127575, + "learning_rate": 1.1511723944953752e-05, + "loss": 0.2792, + "num_tokens": 2226114677.0, + "step": 2918 + }, + { + "epoch": 3.9832182902499653, + "grad_norm": 0.1819290013582697, + "learning_rate": 1.150676103426884e-05, + "loss": 0.2638, + "num_tokens": 2226873356.0, + "step": 2919 + }, + { + "epoch": 3.984583871208646, + "grad_norm": 0.16729362640826265, + "learning_rate": 1.1501797968993082e-05, + "loss": 0.2728, + "num_tokens": 2227558989.0, + "step": 2920 + }, + { + "epoch": 3.9859494521673264, + "grad_norm": 0.1857059618879145, + "learning_rate": 1.1496834750640497e-05, + "loss": 0.2794, + "num_tokens": 2228345563.0, + "step": 2921 + }, + { + "epoch": 3.987315033126007, + "grad_norm": 0.18494213735430767, + "learning_rate": 1.1491871380725143e-05, + "loss": 0.2754, + "num_tokens": 2229074785.0, + "step": 2922 + }, + { + "epoch": 3.9886806140846875, + "grad_norm": 0.1879672316171795, + "learning_rate": 1.1486907860761136e-05, + "loss": 0.2787, + "num_tokens": 2229803139.0, + "step": 2923 + }, + { + "epoch": 3.9900461950433677, + "grad_norm": 0.18580227268952013, + "learning_rate": 1.1481944192262625e-05, + "loss": 0.267, + "num_tokens": 2230574186.0, + "step": 2924 + }, + { + "epoch": 3.991411776002048, + "grad_norm": 0.1794287409782044, + "learning_rate": 1.1476980376743811e-05, + "loss": 0.2827, + "num_tokens": 2231362494.0, + "step": 2925 + }, + { + "epoch": 3.9927773569607288, + "grad_norm": 0.18372679374058787, + "learning_rate": 1.147201641571894e-05, + "loss": 0.269, + "num_tokens": 2232149136.0, + "step": 2926 + }, + { + "epoch": 3.9941429379194093, + "grad_norm": 0.18404995919382425, + "learning_rate": 1.1467052310702303e-05, + "loss": 0.2663, + "num_tokens": 2232837205.0, + "step": 2927 + }, + { + "epoch": 3.99550851887809, + "grad_norm": 0.18852185320196313, + "learning_rate": 1.1462088063208232e-05, + "loss": 0.2924, + "num_tokens": 2233653939.0, + "step": 2928 + }, + { + "epoch": 3.9968740998367704, + "grad_norm": 0.19534993573658424, + "learning_rate": 1.1457123674751103e-05, + "loss": 0.2725, + "num_tokens": 2234432943.0, + "step": 2929 + }, + { + "epoch": 3.998239680795451, + "grad_norm": 0.17807193972253368, + "learning_rate": 1.1452159146845341e-05, + "loss": 0.2772, + "num_tokens": 2235176833.0, + "step": 2930 + }, + { + "epoch": 3.9996052617541316, + "grad_norm": 0.18177812987879696, + "learning_rate": 1.1447194481005399e-05, + "loss": 0.276, + "num_tokens": 2235940096.0, + "step": 2931 + }, + { + "epoch": 4.0, + "grad_norm": 0.259530751404826, + "learning_rate": 1.1442229678745793e-05, + "loss": 0.2413, + "num_tokens": 2236168544.0, + "step": 2932 + }, + { + "epoch": 4.00136558095868, + "grad_norm": 0.3567205198469742, + "learning_rate": 1.143726474158106e-05, + "loss": 0.2435, + "num_tokens": 2236987360.0, + "step": 2933 + }, + { + "epoch": 4.002731161917361, + "grad_norm": 0.3116754729913395, + "learning_rate": 1.1432299671025792e-05, + "loss": 0.244, + "num_tokens": 2237758713.0, + "step": 2934 + }, + { + "epoch": 4.004096742876041, + "grad_norm": 0.23221563228636755, + "learning_rate": 1.1427334468594613e-05, + "loss": 0.2408, + "num_tokens": 2238587284.0, + "step": 2935 + }, + { + "epoch": 4.005462323834722, + "grad_norm": 0.2821871462432716, + "learning_rate": 1.1422369135802197e-05, + "loss": 0.2394, + "num_tokens": 2239322064.0, + "step": 2936 + }, + { + "epoch": 4.006827904793402, + "grad_norm": 0.34115144000672554, + "learning_rate": 1.1417403674163248e-05, + "loss": 0.2341, + "num_tokens": 2240110907.0, + "step": 2937 + }, + { + "epoch": 4.008193485752083, + "grad_norm": 0.30077723103714255, + "learning_rate": 1.1412438085192512e-05, + "loss": 0.2378, + "num_tokens": 2240870632.0, + "step": 2938 + }, + { + "epoch": 4.0095590667107635, + "grad_norm": 0.2382640466385962, + "learning_rate": 1.1407472370404783e-05, + "loss": 0.2393, + "num_tokens": 2241638141.0, + "step": 2939 + }, + { + "epoch": 4.0109246476694445, + "grad_norm": 0.28695913251003635, + "learning_rate": 1.1402506531314875e-05, + "loss": 0.236, + "num_tokens": 2242379438.0, + "step": 2940 + }, + { + "epoch": 4.012290228628125, + "grad_norm": 0.2505091018236723, + "learning_rate": 1.1397540569437657e-05, + "loss": 0.2297, + "num_tokens": 2243077339.0, + "step": 2941 + }, + { + "epoch": 4.013655809586805, + "grad_norm": 0.2416135761754938, + "learning_rate": 1.1392574486288026e-05, + "loss": 0.2323, + "num_tokens": 2243833947.0, + "step": 2942 + }, + { + "epoch": 4.015021390545486, + "grad_norm": 0.2089812544514556, + "learning_rate": 1.1387608283380921e-05, + "loss": 0.2258, + "num_tokens": 2244580544.0, + "step": 2943 + }, + { + "epoch": 4.016386971504166, + "grad_norm": 0.2402718057030777, + "learning_rate": 1.1382641962231316e-05, + "loss": 0.2291, + "num_tokens": 2245321912.0, + "step": 2944 + }, + { + "epoch": 4.017752552462847, + "grad_norm": 0.27322868807481676, + "learning_rate": 1.1377675524354214e-05, + "loss": 0.2405, + "num_tokens": 2246131359.0, + "step": 2945 + }, + { + "epoch": 4.019118133421527, + "grad_norm": 0.22983203955466802, + "learning_rate": 1.1372708971264672e-05, + "loss": 0.2436, + "num_tokens": 2246933675.0, + "step": 2946 + }, + { + "epoch": 4.020483714380208, + "grad_norm": 0.2097140513996182, + "learning_rate": 1.136774230447776e-05, + "loss": 0.2327, + "num_tokens": 2247689412.0, + "step": 2947 + }, + { + "epoch": 4.021849295338888, + "grad_norm": 0.21786118623329076, + "learning_rate": 1.1362775525508597e-05, + "loss": 0.2289, + "num_tokens": 2248483329.0, + "step": 2948 + }, + { + "epoch": 4.023214876297569, + "grad_norm": 0.22194180757690518, + "learning_rate": 1.1357808635872335e-05, + "loss": 0.2412, + "num_tokens": 2249242772.0, + "step": 2949 + }, + { + "epoch": 4.024580457256249, + "grad_norm": 0.20848770662541802, + "learning_rate": 1.1352841637084149e-05, + "loss": 0.2247, + "num_tokens": 2250015676.0, + "step": 2950 + }, + { + "epoch": 4.025946038214929, + "grad_norm": 0.23970581517768458, + "learning_rate": 1.134787453065926e-05, + "loss": 0.2338, + "num_tokens": 2250748202.0, + "step": 2951 + }, + { + "epoch": 4.02731161917361, + "grad_norm": 0.2122427112895838, + "learning_rate": 1.1342907318112919e-05, + "loss": 0.2363, + "num_tokens": 2251483866.0, + "step": 2952 + }, + { + "epoch": 4.02867720013229, + "grad_norm": 0.23648340520337005, + "learning_rate": 1.1337940000960403e-05, + "loss": 0.2293, + "num_tokens": 2252274994.0, + "step": 2953 + }, + { + "epoch": 4.030042781090971, + "grad_norm": 0.20418911812607834, + "learning_rate": 1.1332972580717028e-05, + "loss": 0.2253, + "num_tokens": 2252950740.0, + "step": 2954 + }, + { + "epoch": 4.0314083620496515, + "grad_norm": 0.21687164625774571, + "learning_rate": 1.1328005058898139e-05, + "loss": 0.2344, + "num_tokens": 2253725540.0, + "step": 2955 + }, + { + "epoch": 4.0327739430083325, + "grad_norm": 0.20600558735052868, + "learning_rate": 1.1323037437019113e-05, + "loss": 0.2286, + "num_tokens": 2254562951.0, + "step": 2956 + }, + { + "epoch": 4.034139523967013, + "grad_norm": 0.20001676869141166, + "learning_rate": 1.1318069716595353e-05, + "loss": 0.2263, + "num_tokens": 2255328853.0, + "step": 2957 + }, + { + "epoch": 4.035505104925694, + "grad_norm": 0.2252733489817729, + "learning_rate": 1.13131018991423e-05, + "loss": 0.2384, + "num_tokens": 2256029792.0, + "step": 2958 + }, + { + "epoch": 4.036870685884374, + "grad_norm": 0.22880066081324976, + "learning_rate": 1.1308133986175412e-05, + "loss": 0.2245, + "num_tokens": 2256690837.0, + "step": 2959 + }, + { + "epoch": 4.038236266843054, + "grad_norm": 0.20723609057088757, + "learning_rate": 1.1303165979210189e-05, + "loss": 0.2265, + "num_tokens": 2257404593.0, + "step": 2960 + }, + { + "epoch": 4.039601847801735, + "grad_norm": 0.25350003585749636, + "learning_rate": 1.1298197879762158e-05, + "loss": 0.228, + "num_tokens": 2258142808.0, + "step": 2961 + }, + { + "epoch": 4.040967428760415, + "grad_norm": 0.2303284502804076, + "learning_rate": 1.1293229689346868e-05, + "loss": 0.2403, + "num_tokens": 2258926808.0, + "step": 2962 + }, + { + "epoch": 4.042333009719096, + "grad_norm": 0.19898947145167858, + "learning_rate": 1.1288261409479898e-05, + "loss": 0.2334, + "num_tokens": 2259768969.0, + "step": 2963 + }, + { + "epoch": 4.043698590677776, + "grad_norm": 0.19214636908592034, + "learning_rate": 1.128329304167685e-05, + "loss": 0.2316, + "num_tokens": 2260539230.0, + "step": 2964 + }, + { + "epoch": 4.045064171636457, + "grad_norm": 0.2274160130600864, + "learning_rate": 1.1278324587453369e-05, + "loss": 0.2395, + "num_tokens": 2261278397.0, + "step": 2965 + }, + { + "epoch": 4.046429752595137, + "grad_norm": 0.19315576180610558, + "learning_rate": 1.1273356048325107e-05, + "loss": 0.2289, + "num_tokens": 2262055951.0, + "step": 2966 + }, + { + "epoch": 4.047795333553818, + "grad_norm": 0.20215573834102485, + "learning_rate": 1.1268387425807753e-05, + "loss": 0.2334, + "num_tokens": 2262797673.0, + "step": 2967 + }, + { + "epoch": 4.049160914512498, + "grad_norm": 0.42570247561264096, + "learning_rate": 1.1263418721417023e-05, + "loss": 0.2328, + "num_tokens": 2263620425.0, + "step": 2968 + }, + { + "epoch": 4.050526495471178, + "grad_norm": 0.22980395835194284, + "learning_rate": 1.1258449936668643e-05, + "loss": 0.2393, + "num_tokens": 2264328474.0, + "step": 2969 + }, + { + "epoch": 4.051892076429859, + "grad_norm": 0.2127558652513868, + "learning_rate": 1.1253481073078382e-05, + "loss": 0.2241, + "num_tokens": 2265066284.0, + "step": 2970 + }, + { + "epoch": 4.0532576573885395, + "grad_norm": 0.19927313426214288, + "learning_rate": 1.1248512132162026e-05, + "loss": 0.2292, + "num_tokens": 2265837009.0, + "step": 2971 + }, + { + "epoch": 4.0546232383472205, + "grad_norm": 0.20560902151810134, + "learning_rate": 1.1243543115435376e-05, + "loss": 0.2237, + "num_tokens": 2266590447.0, + "step": 2972 + }, + { + "epoch": 4.055988819305901, + "grad_norm": 0.23235024654073558, + "learning_rate": 1.1238574024414268e-05, + "loss": 0.2354, + "num_tokens": 2267347558.0, + "step": 2973 + }, + { + "epoch": 4.057354400264582, + "grad_norm": 0.2113246169791323, + "learning_rate": 1.123360486061456e-05, + "loss": 0.2316, + "num_tokens": 2268090185.0, + "step": 2974 + }, + { + "epoch": 4.058719981223262, + "grad_norm": 0.21350457315541654, + "learning_rate": 1.1228635625552124e-05, + "loss": 0.2402, + "num_tokens": 2268829092.0, + "step": 2975 + }, + { + "epoch": 4.060085562181943, + "grad_norm": 0.2123411775104535, + "learning_rate": 1.1223666320742858e-05, + "loss": 0.2254, + "num_tokens": 2269535727.0, + "step": 2976 + }, + { + "epoch": 4.061451143140623, + "grad_norm": 0.21499485670566587, + "learning_rate": 1.1218696947702688e-05, + "loss": 0.2346, + "num_tokens": 2270270478.0, + "step": 2977 + }, + { + "epoch": 4.062816724099303, + "grad_norm": 0.20167318044401317, + "learning_rate": 1.1213727507947547e-05, + "loss": 0.2342, + "num_tokens": 2271085462.0, + "step": 2978 + }, + { + "epoch": 4.064182305057984, + "grad_norm": 0.2189921301777101, + "learning_rate": 1.1208758002993403e-05, + "loss": 0.2264, + "num_tokens": 2271772482.0, + "step": 2979 + }, + { + "epoch": 4.065547886016664, + "grad_norm": 0.22366448383397522, + "learning_rate": 1.120378843435623e-05, + "loss": 0.2366, + "num_tokens": 2272494755.0, + "step": 2980 + }, + { + "epoch": 4.066913466975345, + "grad_norm": 0.20274208786727263, + "learning_rate": 1.119881880355203e-05, + "loss": 0.2427, + "num_tokens": 2273323102.0, + "step": 2981 + }, + { + "epoch": 4.068279047934025, + "grad_norm": 0.19959727507650785, + "learning_rate": 1.1193849112096827e-05, + "loss": 0.2287, + "num_tokens": 2274073193.0, + "step": 2982 + }, + { + "epoch": 4.069644628892706, + "grad_norm": 0.20115044330998796, + "learning_rate": 1.118887936150665e-05, + "loss": 0.2345, + "num_tokens": 2274874460.0, + "step": 2983 + }, + { + "epoch": 4.071010209851386, + "grad_norm": 0.2097136680678643, + "learning_rate": 1.1183909553297564e-05, + "loss": 0.2404, + "num_tokens": 2275574966.0, + "step": 2984 + }, + { + "epoch": 4.072375790810067, + "grad_norm": 0.20026519486414557, + "learning_rate": 1.1178939688985633e-05, + "loss": 0.2407, + "num_tokens": 2276343888.0, + "step": 2985 + }, + { + "epoch": 4.073741371768747, + "grad_norm": 0.23382542737896203, + "learning_rate": 1.1173969770086957e-05, + "loss": 0.2216, + "num_tokens": 2277085124.0, + "step": 2986 + }, + { + "epoch": 4.0751069527274275, + "grad_norm": 0.18997779175287827, + "learning_rate": 1.1168999798117635e-05, + "loss": 0.2382, + "num_tokens": 2277906360.0, + "step": 2987 + }, + { + "epoch": 4.0764725336861085, + "grad_norm": 0.21599336846742037, + "learning_rate": 1.1164029774593791e-05, + "loss": 0.2358, + "num_tokens": 2278676149.0, + "step": 2988 + }, + { + "epoch": 4.077838114644789, + "grad_norm": 0.19831287778439152, + "learning_rate": 1.1159059701031571e-05, + "loss": 0.2376, + "num_tokens": 2279536253.0, + "step": 2989 + }, + { + "epoch": 4.07920369560347, + "grad_norm": 0.19533853289510084, + "learning_rate": 1.1154089578947123e-05, + "loss": 0.2341, + "num_tokens": 2280345870.0, + "step": 2990 + }, + { + "epoch": 4.08056927656215, + "grad_norm": 0.19971431101126547, + "learning_rate": 1.1149119409856617e-05, + "loss": 0.2282, + "num_tokens": 2281166375.0, + "step": 2991 + }, + { + "epoch": 4.081934857520831, + "grad_norm": 0.20992421928215338, + "learning_rate": 1.1144149195276237e-05, + "loss": 0.237, + "num_tokens": 2281955896.0, + "step": 2992 + }, + { + "epoch": 4.083300438479511, + "grad_norm": 0.19182293730256667, + "learning_rate": 1.113917893672218e-05, + "loss": 0.2291, + "num_tokens": 2282819016.0, + "step": 2993 + }, + { + "epoch": 4.084666019438192, + "grad_norm": 0.20195617196831128, + "learning_rate": 1.1134208635710657e-05, + "loss": 0.2376, + "num_tokens": 2283607744.0, + "step": 2994 + }, + { + "epoch": 4.086031600396872, + "grad_norm": 0.20433530698770025, + "learning_rate": 1.1129238293757891e-05, + "loss": 0.2307, + "num_tokens": 2284358032.0, + "step": 2995 + }, + { + "epoch": 4.087397181355552, + "grad_norm": 0.21619160180426805, + "learning_rate": 1.112426791238012e-05, + "loss": 0.2302, + "num_tokens": 2285100283.0, + "step": 2996 + }, + { + "epoch": 4.088762762314233, + "grad_norm": 0.19818609925744374, + "learning_rate": 1.1119297493093588e-05, + "loss": 0.2297, + "num_tokens": 2285846988.0, + "step": 2997 + }, + { + "epoch": 4.090128343272913, + "grad_norm": 0.20212439243901867, + "learning_rate": 1.111432703741456e-05, + "loss": 0.2325, + "num_tokens": 2286620684.0, + "step": 2998 + }, + { + "epoch": 4.091493924231594, + "grad_norm": 0.19034934487831467, + "learning_rate": 1.11093565468593e-05, + "loss": 0.2294, + "num_tokens": 2287384151.0, + "step": 2999 + }, + { + "epoch": 4.092859505190274, + "grad_norm": 0.22016614257432005, + "learning_rate": 1.1104386022944096e-05, + "loss": 0.2302, + "num_tokens": 2288137806.0, + "step": 3000 + }, + { + "epoch": 4.094225086148955, + "grad_norm": 0.20887131337377915, + "learning_rate": 1.1099415467185237e-05, + "loss": 0.2451, + "num_tokens": 2288919886.0, + "step": 3001 + }, + { + "epoch": 4.095590667107635, + "grad_norm": 0.26431269182398626, + "learning_rate": 1.1094444881099025e-05, + "loss": 0.2276, + "num_tokens": 2289640952.0, + "step": 3002 + }, + { + "epoch": 4.096956248066316, + "grad_norm": 0.21373615018941874, + "learning_rate": 1.1089474266201769e-05, + "loss": 0.2243, + "num_tokens": 2290344618.0, + "step": 3003 + }, + { + "epoch": 4.0983218290249965, + "grad_norm": 0.23399530516651257, + "learning_rate": 1.1084503624009787e-05, + "loss": 0.2391, + "num_tokens": 2291000956.0, + "step": 3004 + }, + { + "epoch": 4.099687409983677, + "grad_norm": 0.20161223947388524, + "learning_rate": 1.1079532956039413e-05, + "loss": 0.2289, + "num_tokens": 2291746004.0, + "step": 3005 + }, + { + "epoch": 4.101052990942358, + "grad_norm": 0.21269249421310113, + "learning_rate": 1.1074562263806977e-05, + "loss": 0.2371, + "num_tokens": 2292480613.0, + "step": 3006 + }, + { + "epoch": 4.102418571901038, + "grad_norm": 0.34864210026694215, + "learning_rate": 1.106959154882882e-05, + "loss": 0.2482, + "num_tokens": 2293296004.0, + "step": 3007 + }, + { + "epoch": 4.103784152859719, + "grad_norm": 0.20654039946391828, + "learning_rate": 1.1064620812621299e-05, + "loss": 0.2332, + "num_tokens": 2294045155.0, + "step": 3008 + }, + { + "epoch": 4.105149733818399, + "grad_norm": 0.20936868170671807, + "learning_rate": 1.1059650056700763e-05, + "loss": 0.2376, + "num_tokens": 2294773432.0, + "step": 3009 + }, + { + "epoch": 4.10651531477708, + "grad_norm": 0.25975734440976667, + "learning_rate": 1.1054679282583583e-05, + "loss": 0.2371, + "num_tokens": 2295464793.0, + "step": 3010 + }, + { + "epoch": 4.10788089573576, + "grad_norm": 0.24863519191567104, + "learning_rate": 1.104970849178612e-05, + "loss": 0.2349, + "num_tokens": 2296242292.0, + "step": 3011 + }, + { + "epoch": 4.109246476694441, + "grad_norm": 0.21221002973890402, + "learning_rate": 1.1044737685824746e-05, + "loss": 0.2307, + "num_tokens": 2297010438.0, + "step": 3012 + }, + { + "epoch": 4.110612057653121, + "grad_norm": 0.2079083851834388, + "learning_rate": 1.1039766866215844e-05, + "loss": 0.2478, + "num_tokens": 2297848831.0, + "step": 3013 + }, + { + "epoch": 4.111977638611801, + "grad_norm": 0.21974156278439927, + "learning_rate": 1.1034796034475795e-05, + "loss": 0.2334, + "num_tokens": 2298581441.0, + "step": 3014 + }, + { + "epoch": 4.113343219570482, + "grad_norm": 0.2220385321033225, + "learning_rate": 1.1029825192120978e-05, + "loss": 0.2413, + "num_tokens": 2299342586.0, + "step": 3015 + }, + { + "epoch": 4.114708800529162, + "grad_norm": 0.2245205056810607, + "learning_rate": 1.102485434066779e-05, + "loss": 0.2338, + "num_tokens": 2300023907.0, + "step": 3016 + }, + { + "epoch": 4.116074381487843, + "grad_norm": 0.20875638652486952, + "learning_rate": 1.1019883481632618e-05, + "loss": 0.2515, + "num_tokens": 2300749921.0, + "step": 3017 + }, + { + "epoch": 4.117439962446523, + "grad_norm": 0.21414916099508424, + "learning_rate": 1.1014912616531858e-05, + "loss": 0.2387, + "num_tokens": 2301634068.0, + "step": 3018 + }, + { + "epoch": 4.118805543405204, + "grad_norm": 0.2021738663416521, + "learning_rate": 1.1009941746881903e-05, + "loss": 0.2324, + "num_tokens": 2302386679.0, + "step": 3019 + }, + { + "epoch": 4.1201711243638846, + "grad_norm": 0.22044489951305987, + "learning_rate": 1.1004970874199152e-05, + "loss": 0.2372, + "num_tokens": 2303181031.0, + "step": 3020 + }, + { + "epoch": 4.1215367053225656, + "grad_norm": 0.23148069915324562, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.2389, + "num_tokens": 2303965214.0, + "step": 3021 + }, + { + "epoch": 4.122902286281246, + "grad_norm": 0.20394877786838123, + "learning_rate": 1.0995029125800854e-05, + "loss": 0.2385, + "num_tokens": 2304706226.0, + "step": 3022 + }, + { + "epoch": 4.124267867239926, + "grad_norm": 0.21521157264487975, + "learning_rate": 1.09900582531181e-05, + "loss": 0.2258, + "num_tokens": 2305449469.0, + "step": 3023 + }, + { + "epoch": 4.125633448198607, + "grad_norm": 0.20771484509126392, + "learning_rate": 1.098508738346815e-05, + "loss": 0.2367, + "num_tokens": 2306175759.0, + "step": 3024 + }, + { + "epoch": 4.126999029157287, + "grad_norm": 0.22028533693698774, + "learning_rate": 1.0980116518367383e-05, + "loss": 0.2272, + "num_tokens": 2306857275.0, + "step": 3025 + }, + { + "epoch": 4.128364610115968, + "grad_norm": 0.22737495496610513, + "learning_rate": 1.0975145659332216e-05, + "loss": 0.237, + "num_tokens": 2307698780.0, + "step": 3026 + }, + { + "epoch": 4.129730191074648, + "grad_norm": 0.20107042611775644, + "learning_rate": 1.0970174807879023e-05, + "loss": 0.2323, + "num_tokens": 2308387770.0, + "step": 3027 + }, + { + "epoch": 4.131095772033329, + "grad_norm": 0.21356337464756264, + "learning_rate": 1.096520396552421e-05, + "loss": 0.2349, + "num_tokens": 2309184442.0, + "step": 3028 + }, + { + "epoch": 4.132461352992009, + "grad_norm": 0.21152912612828545, + "learning_rate": 1.0960233133784159e-05, + "loss": 0.2359, + "num_tokens": 2309934760.0, + "step": 3029 + }, + { + "epoch": 4.13382693395069, + "grad_norm": 0.21413171048858157, + "learning_rate": 1.0955262314175256e-05, + "loss": 0.24, + "num_tokens": 2310770840.0, + "step": 3030 + }, + { + "epoch": 4.13519251490937, + "grad_norm": 0.21260498689931373, + "learning_rate": 1.0950291508213886e-05, + "loss": 0.2301, + "num_tokens": 2311483471.0, + "step": 3031 + }, + { + "epoch": 4.13655809586805, + "grad_norm": 0.2025962812961191, + "learning_rate": 1.0945320717416421e-05, + "loss": 0.2406, + "num_tokens": 2312280668.0, + "step": 3032 + }, + { + "epoch": 4.137923676826731, + "grad_norm": 0.20384162225970578, + "learning_rate": 1.094034994329924e-05, + "loss": 0.2364, + "num_tokens": 2313026099.0, + "step": 3033 + }, + { + "epoch": 4.1392892577854115, + "grad_norm": 0.20681176141062596, + "learning_rate": 1.0935379187378704e-05, + "loss": 0.2392, + "num_tokens": 2313777789.0, + "step": 3034 + }, + { + "epoch": 4.1406548387440925, + "grad_norm": 0.20065539244099134, + "learning_rate": 1.0930408451171184e-05, + "loss": 0.2334, + "num_tokens": 2314593680.0, + "step": 3035 + }, + { + "epoch": 4.142020419702773, + "grad_norm": 0.22573382750884557, + "learning_rate": 1.0925437736193028e-05, + "loss": 0.2435, + "num_tokens": 2315338516.0, + "step": 3036 + }, + { + "epoch": 4.143386000661454, + "grad_norm": 0.20889935856263053, + "learning_rate": 1.092046704396059e-05, + "loss": 0.2334, + "num_tokens": 2316062639.0, + "step": 3037 + }, + { + "epoch": 4.144751581620134, + "grad_norm": 0.20089951816966278, + "learning_rate": 1.0915496375990215e-05, + "loss": 0.2297, + "num_tokens": 2316855557.0, + "step": 3038 + }, + { + "epoch": 4.146117162578815, + "grad_norm": 0.20241669095683282, + "learning_rate": 1.0910525733798234e-05, + "loss": 0.2336, + "num_tokens": 2317637883.0, + "step": 3039 + }, + { + "epoch": 4.147482743537495, + "grad_norm": 0.20179725072021962, + "learning_rate": 1.0905555118900979e-05, + "loss": 0.2357, + "num_tokens": 2318407581.0, + "step": 3040 + }, + { + "epoch": 4.148848324496175, + "grad_norm": 0.21925344051536316, + "learning_rate": 1.0900584532814766e-05, + "loss": 0.2428, + "num_tokens": 2319176345.0, + "step": 3041 + }, + { + "epoch": 4.150213905454856, + "grad_norm": 0.19972262253201878, + "learning_rate": 1.0895613977055908e-05, + "loss": 0.2397, + "num_tokens": 2319902039.0, + "step": 3042 + }, + { + "epoch": 4.151579486413536, + "grad_norm": 0.21847165264060467, + "learning_rate": 1.0890643453140702e-05, + "loss": 0.2402, + "num_tokens": 2320649678.0, + "step": 3043 + }, + { + "epoch": 4.152945067372217, + "grad_norm": 0.20672435870651132, + "learning_rate": 1.0885672962585443e-05, + "loss": 0.2352, + "num_tokens": 2321393010.0, + "step": 3044 + }, + { + "epoch": 4.154310648330897, + "grad_norm": 0.18754411335238574, + "learning_rate": 1.0880702506906418e-05, + "loss": 0.2363, + "num_tokens": 2322189304.0, + "step": 3045 + }, + { + "epoch": 4.155676229289578, + "grad_norm": 0.23884810056471587, + "learning_rate": 1.0875732087619884e-05, + "loss": 0.241, + "num_tokens": 2322994037.0, + "step": 3046 + }, + { + "epoch": 4.157041810248258, + "grad_norm": 0.22598991255324583, + "learning_rate": 1.0870761706242112e-05, + "loss": 0.2195, + "num_tokens": 2323715960.0, + "step": 3047 + }, + { + "epoch": 4.158407391206939, + "grad_norm": 0.20060243419477786, + "learning_rate": 1.0865791364289347e-05, + "loss": 0.2328, + "num_tokens": 2324404936.0, + "step": 3048 + }, + { + "epoch": 4.159772972165619, + "grad_norm": 0.2248131198820371, + "learning_rate": 1.0860821063277824e-05, + "loss": 0.2371, + "num_tokens": 2325143633.0, + "step": 3049 + }, + { + "epoch": 4.1611385531242995, + "grad_norm": 0.19441354638749594, + "learning_rate": 1.0855850804723767e-05, + "loss": 0.2299, + "num_tokens": 2325939768.0, + "step": 3050 + }, + { + "epoch": 4.1625041340829805, + "grad_norm": 0.21225564024239452, + "learning_rate": 1.0850880590143387e-05, + "loss": 0.2364, + "num_tokens": 2326707300.0, + "step": 3051 + }, + { + "epoch": 4.163869715041661, + "grad_norm": 0.19598277423915772, + "learning_rate": 1.0845910421052878e-05, + "loss": 0.2406, + "num_tokens": 2327559022.0, + "step": 3052 + }, + { + "epoch": 4.165235296000342, + "grad_norm": 0.2061929369454302, + "learning_rate": 1.0840940298968432e-05, + "loss": 0.2383, + "num_tokens": 2328305326.0, + "step": 3053 + }, + { + "epoch": 4.166600876959022, + "grad_norm": 0.21344630263274617, + "learning_rate": 1.0835970225406213e-05, + "loss": 0.2407, + "num_tokens": 2329065882.0, + "step": 3054 + }, + { + "epoch": 4.167966457917703, + "grad_norm": 0.2005535177505611, + "learning_rate": 1.083100020188237e-05, + "loss": 0.2369, + "num_tokens": 2329833080.0, + "step": 3055 + }, + { + "epoch": 4.169332038876383, + "grad_norm": 0.20869065443546297, + "learning_rate": 1.082603022991305e-05, + "loss": 0.242, + "num_tokens": 2330593100.0, + "step": 3056 + }, + { + "epoch": 4.170697619835064, + "grad_norm": 0.20516283692661014, + "learning_rate": 1.0821060311014368e-05, + "loss": 0.2492, + "num_tokens": 2331345023.0, + "step": 3057 + }, + { + "epoch": 4.172063200793744, + "grad_norm": 0.200393771532012, + "learning_rate": 1.0816090446702443e-05, + "loss": 0.2278, + "num_tokens": 2332059904.0, + "step": 3058 + }, + { + "epoch": 4.173428781752424, + "grad_norm": 0.20865356011847716, + "learning_rate": 1.0811120638493353e-05, + "loss": 0.2214, + "num_tokens": 2332774494.0, + "step": 3059 + }, + { + "epoch": 4.174794362711105, + "grad_norm": 0.20082147874317002, + "learning_rate": 1.080615088790318e-05, + "loss": 0.2279, + "num_tokens": 2333508237.0, + "step": 3060 + }, + { + "epoch": 4.176159943669785, + "grad_norm": 0.19885375270303932, + "learning_rate": 1.0801181196447975e-05, + "loss": 0.2367, + "num_tokens": 2334343237.0, + "step": 3061 + }, + { + "epoch": 4.177525524628466, + "grad_norm": 0.21133462373849365, + "learning_rate": 1.0796211565643773e-05, + "loss": 0.2338, + "num_tokens": 2335059906.0, + "step": 3062 + }, + { + "epoch": 4.178891105587146, + "grad_norm": 0.20702309773000258, + "learning_rate": 1.0791241997006603e-05, + "loss": 0.247, + "num_tokens": 2335900304.0, + "step": 3063 + }, + { + "epoch": 4.180256686545827, + "grad_norm": 0.21917674490134564, + "learning_rate": 1.0786272492052454e-05, + "loss": 0.2305, + "num_tokens": 2336629569.0, + "step": 3064 + }, + { + "epoch": 4.181622267504507, + "grad_norm": 0.20935642300242066, + "learning_rate": 1.0781303052297316e-05, + "loss": 0.2313, + "num_tokens": 2337330186.0, + "step": 3065 + }, + { + "epoch": 4.182987848463188, + "grad_norm": 0.1992655762736646, + "learning_rate": 1.0776333679257143e-05, + "loss": 0.2344, + "num_tokens": 2338140663.0, + "step": 3066 + }, + { + "epoch": 4.1843534294218685, + "grad_norm": 0.2121090168469975, + "learning_rate": 1.0771364374447878e-05, + "loss": 0.2348, + "num_tokens": 2338861753.0, + "step": 3067 + }, + { + "epoch": 4.185719010380549, + "grad_norm": 0.21660037780204572, + "learning_rate": 1.0766395139385443e-05, + "loss": 0.2316, + "num_tokens": 2339582314.0, + "step": 3068 + }, + { + "epoch": 4.18708459133923, + "grad_norm": 0.20805809975757786, + "learning_rate": 1.0761425975585735e-05, + "loss": 0.23, + "num_tokens": 2340273598.0, + "step": 3069 + }, + { + "epoch": 4.18845017229791, + "grad_norm": 0.2079837716099726, + "learning_rate": 1.075645688456463e-05, + "loss": 0.2321, + "num_tokens": 2341005832.0, + "step": 3070 + }, + { + "epoch": 4.189815753256591, + "grad_norm": 0.20598743450596463, + "learning_rate": 1.075148786783798e-05, + "loss": 0.2278, + "num_tokens": 2341756683.0, + "step": 3071 + }, + { + "epoch": 4.191181334215271, + "grad_norm": 0.21214065871272952, + "learning_rate": 1.074651892692162e-05, + "loss": 0.2408, + "num_tokens": 2342522329.0, + "step": 3072 + }, + { + "epoch": 4.192546915173952, + "grad_norm": 0.20393906945502938, + "learning_rate": 1.0741550063331358e-05, + "loss": 0.2457, + "num_tokens": 2343333443.0, + "step": 3073 + }, + { + "epoch": 4.193912496132632, + "grad_norm": 0.20964011962623855, + "learning_rate": 1.0736581278582983e-05, + "loss": 0.2342, + "num_tokens": 2344085941.0, + "step": 3074 + }, + { + "epoch": 4.195278077091313, + "grad_norm": 0.205432335308915, + "learning_rate": 1.0731612574192248e-05, + "loss": 0.2421, + "num_tokens": 2344867882.0, + "step": 3075 + }, + { + "epoch": 4.196643658049993, + "grad_norm": 0.20483511653651984, + "learning_rate": 1.0726643951674895e-05, + "loss": 0.2389, + "num_tokens": 2345677941.0, + "step": 3076 + }, + { + "epoch": 4.198009239008673, + "grad_norm": 0.20594712849069932, + "learning_rate": 1.0721675412546636e-05, + "loss": 0.2389, + "num_tokens": 2346378536.0, + "step": 3077 + }, + { + "epoch": 4.199374819967354, + "grad_norm": 0.20735257238572444, + "learning_rate": 1.0716706958323153e-05, + "loss": 0.2341, + "num_tokens": 2347149882.0, + "step": 3078 + }, + { + "epoch": 4.200740400926034, + "grad_norm": 0.21645782356358312, + "learning_rate": 1.0711738590520109e-05, + "loss": 0.2425, + "num_tokens": 2347844545.0, + "step": 3079 + }, + { + "epoch": 4.202105981884715, + "grad_norm": 0.2083967868462942, + "learning_rate": 1.0706770310653137e-05, + "loss": 0.2334, + "num_tokens": 2348620408.0, + "step": 3080 + }, + { + "epoch": 4.203471562843395, + "grad_norm": 0.2168416999625079, + "learning_rate": 1.0701802120237845e-05, + "loss": 0.2388, + "num_tokens": 2349333715.0, + "step": 3081 + }, + { + "epoch": 4.204837143802076, + "grad_norm": 0.1946529110591853, + "learning_rate": 1.0696834020789812e-05, + "loss": 0.2389, + "num_tokens": 2350105779.0, + "step": 3082 + }, + { + "epoch": 4.2062027247607565, + "grad_norm": 0.21922663210241894, + "learning_rate": 1.0691866013824593e-05, + "loss": 0.2431, + "num_tokens": 2350825441.0, + "step": 3083 + }, + { + "epoch": 4.2075683057194375, + "grad_norm": 0.21040965907356396, + "learning_rate": 1.0686898100857708e-05, + "loss": 0.2451, + "num_tokens": 2351638437.0, + "step": 3084 + }, + { + "epoch": 4.208933886678118, + "grad_norm": 0.20193490795667685, + "learning_rate": 1.068193028340465e-05, + "loss": 0.2338, + "num_tokens": 2352401818.0, + "step": 3085 + }, + { + "epoch": 4.210299467636798, + "grad_norm": 0.20976672578259514, + "learning_rate": 1.0676962562980893e-05, + "loss": 0.2317, + "num_tokens": 2353076321.0, + "step": 3086 + }, + { + "epoch": 4.211665048595479, + "grad_norm": 0.2158200670446393, + "learning_rate": 1.0671994941101864e-05, + "loss": 0.24, + "num_tokens": 2353869317.0, + "step": 3087 + }, + { + "epoch": 4.213030629554159, + "grad_norm": 0.21227295370896518, + "learning_rate": 1.0667027419282976e-05, + "loss": 0.2293, + "num_tokens": 2354641343.0, + "step": 3088 + }, + { + "epoch": 4.21439621051284, + "grad_norm": 0.19694899964490908, + "learning_rate": 1.0662059999039598e-05, + "loss": 0.2482, + "num_tokens": 2355491971.0, + "step": 3089 + }, + { + "epoch": 4.21576179147152, + "grad_norm": 0.2007243077750762, + "learning_rate": 1.0657092681887084e-05, + "loss": 0.2378, + "num_tokens": 2356264724.0, + "step": 3090 + }, + { + "epoch": 4.217127372430201, + "grad_norm": 0.20466828795644249, + "learning_rate": 1.0652125469340742e-05, + "loss": 0.2401, + "num_tokens": 2357088088.0, + "step": 3091 + }, + { + "epoch": 4.218492953388881, + "grad_norm": 0.22391692960079204, + "learning_rate": 1.0647158362915854e-05, + "loss": 0.2523, + "num_tokens": 2357854793.0, + "step": 3092 + }, + { + "epoch": 4.219858534347562, + "grad_norm": 0.20196338794581317, + "learning_rate": 1.0642191364127672e-05, + "loss": 0.2445, + "num_tokens": 2358669950.0, + "step": 3093 + }, + { + "epoch": 4.221224115306242, + "grad_norm": 0.20899101174786805, + "learning_rate": 1.0637224474491405e-05, + "loss": 0.2305, + "num_tokens": 2359433702.0, + "step": 3094 + }, + { + "epoch": 4.222589696264922, + "grad_norm": 0.190537831455725, + "learning_rate": 1.0632257695522243e-05, + "loss": 0.2402, + "num_tokens": 2360259523.0, + "step": 3095 + }, + { + "epoch": 4.223955277223603, + "grad_norm": 0.1980803021041769, + "learning_rate": 1.062729102873533e-05, + "loss": 0.2293, + "num_tokens": 2361010442.0, + "step": 3096 + }, + { + "epoch": 4.225320858182283, + "grad_norm": 0.1966411703073702, + "learning_rate": 1.0622324475645789e-05, + "loss": 0.2409, + "num_tokens": 2361781135.0, + "step": 3097 + }, + { + "epoch": 4.226686439140964, + "grad_norm": 0.21637400960642358, + "learning_rate": 1.0617358037768689e-05, + "loss": 0.2413, + "num_tokens": 2362599399.0, + "step": 3098 + }, + { + "epoch": 4.2280520200996445, + "grad_norm": 0.2061604804264618, + "learning_rate": 1.0612391716619081e-05, + "loss": 0.2337, + "num_tokens": 2363302966.0, + "step": 3099 + }, + { + "epoch": 4.2294176010583255, + "grad_norm": 0.19550469513994223, + "learning_rate": 1.0607425513711977e-05, + "loss": 0.234, + "num_tokens": 2364123972.0, + "step": 3100 + }, + { + "epoch": 4.230783182017006, + "grad_norm": 0.19493038231525944, + "learning_rate": 1.0602459430562348e-05, + "loss": 0.2254, + "num_tokens": 2364879335.0, + "step": 3101 + }, + { + "epoch": 4.232148762975687, + "grad_norm": 0.20462021460951182, + "learning_rate": 1.059749346868513e-05, + "loss": 0.2315, + "num_tokens": 2365698147.0, + "step": 3102 + }, + { + "epoch": 4.233514343934367, + "grad_norm": 0.21474196614100272, + "learning_rate": 1.0592527629595222e-05, + "loss": 0.2386, + "num_tokens": 2366446852.0, + "step": 3103 + }, + { + "epoch": 4.234879924893047, + "grad_norm": 0.1969899987509542, + "learning_rate": 1.058756191480749e-05, + "loss": 0.234, + "num_tokens": 2367199993.0, + "step": 3104 + }, + { + "epoch": 4.236245505851728, + "grad_norm": 0.21428092856227068, + "learning_rate": 1.0582596325836754e-05, + "loss": 0.2435, + "num_tokens": 2367975227.0, + "step": 3105 + }, + { + "epoch": 4.237611086810408, + "grad_norm": 0.20318132819512028, + "learning_rate": 1.0577630864197808e-05, + "loss": 0.2319, + "num_tokens": 2368717223.0, + "step": 3106 + }, + { + "epoch": 4.238976667769089, + "grad_norm": 0.22663260883454947, + "learning_rate": 1.0572665531405393e-05, + "loss": 0.2422, + "num_tokens": 2369437957.0, + "step": 3107 + }, + { + "epoch": 4.240342248727769, + "grad_norm": 0.1910331821684642, + "learning_rate": 1.056770032897421e-05, + "loss": 0.2391, + "num_tokens": 2370315977.0, + "step": 3108 + }, + { + "epoch": 4.24170782968645, + "grad_norm": 0.20847047091907853, + "learning_rate": 1.0562735258418943e-05, + "loss": 0.2442, + "num_tokens": 2371059578.0, + "step": 3109 + }, + { + "epoch": 4.24307341064513, + "grad_norm": 0.2164927402827901, + "learning_rate": 1.055777032125421e-05, + "loss": 0.239, + "num_tokens": 2371881213.0, + "step": 3110 + }, + { + "epoch": 4.244438991603811, + "grad_norm": 0.19065370483505387, + "learning_rate": 1.0552805518994604e-05, + "loss": 0.2448, + "num_tokens": 2372727248.0, + "step": 3111 + }, + { + "epoch": 4.245804572562491, + "grad_norm": 0.2132099726736033, + "learning_rate": 1.0547840853154663e-05, + "loss": 0.2368, + "num_tokens": 2373487194.0, + "step": 3112 + }, + { + "epoch": 4.247170153521171, + "grad_norm": 0.22539648099747198, + "learning_rate": 1.0542876325248898e-05, + "loss": 0.2419, + "num_tokens": 2374239350.0, + "step": 3113 + }, + { + "epoch": 4.248535734479852, + "grad_norm": 0.2061580866744798, + "learning_rate": 1.053791193679177e-05, + "loss": 0.2432, + "num_tokens": 2375006854.0, + "step": 3114 + }, + { + "epoch": 4.249901315438533, + "grad_norm": 0.20319103738308167, + "learning_rate": 1.0532947689297702e-05, + "loss": 0.2396, + "num_tokens": 2375811675.0, + "step": 3115 + }, + { + "epoch": 4.251266896397214, + "grad_norm": 0.21203571703425464, + "learning_rate": 1.0527983584281065e-05, + "loss": 0.2308, + "num_tokens": 2376578361.0, + "step": 3116 + }, + { + "epoch": 4.252632477355894, + "grad_norm": 0.20709345172654664, + "learning_rate": 1.0523019623256191e-05, + "loss": 0.2293, + "num_tokens": 2377266865.0, + "step": 3117 + }, + { + "epoch": 4.253998058314575, + "grad_norm": 0.22012829244602425, + "learning_rate": 1.051805580773738e-05, + "loss": 0.2372, + "num_tokens": 2378006051.0, + "step": 3118 + }, + { + "epoch": 4.255363639273255, + "grad_norm": 0.21757228619812713, + "learning_rate": 1.0513092139238868e-05, + "loss": 0.2348, + "num_tokens": 2378773515.0, + "step": 3119 + }, + { + "epoch": 4.256729220231936, + "grad_norm": 0.20403491850676322, + "learning_rate": 1.0508128619274858e-05, + "loss": 0.2414, + "num_tokens": 2379504325.0, + "step": 3120 + }, + { + "epoch": 4.258094801190616, + "grad_norm": 0.20634220418258029, + "learning_rate": 1.0503165249359504e-05, + "loss": 0.2341, + "num_tokens": 2380268033.0, + "step": 3121 + }, + { + "epoch": 4.259460382149296, + "grad_norm": 0.21142154749421194, + "learning_rate": 1.049820203100692e-05, + "loss": 0.2186, + "num_tokens": 2380969674.0, + "step": 3122 + }, + { + "epoch": 4.260825963107977, + "grad_norm": 0.20557494083720151, + "learning_rate": 1.0493238965731164e-05, + "loss": 0.2401, + "num_tokens": 2381724214.0, + "step": 3123 + }, + { + "epoch": 4.262191544066657, + "grad_norm": 0.2185561630493158, + "learning_rate": 1.048827605504625e-05, + "loss": 0.2406, + "num_tokens": 2382479549.0, + "step": 3124 + }, + { + "epoch": 4.263557125025338, + "grad_norm": 0.2136340245665799, + "learning_rate": 1.0483313300466157e-05, + "loss": 0.2385, + "num_tokens": 2383230172.0, + "step": 3125 + }, + { + "epoch": 4.264922705984018, + "grad_norm": 0.21772329313341057, + "learning_rate": 1.0478350703504794e-05, + "loss": 0.2336, + "num_tokens": 2383940168.0, + "step": 3126 + }, + { + "epoch": 4.266288286942699, + "grad_norm": 0.2196221576092348, + "learning_rate": 1.0473388265676044e-05, + "loss": 0.229, + "num_tokens": 2384682361.0, + "step": 3127 + }, + { + "epoch": 4.267653867901379, + "grad_norm": 0.19986364973988935, + "learning_rate": 1.0468425988493727e-05, + "loss": 0.2295, + "num_tokens": 2385467555.0, + "step": 3128 + }, + { + "epoch": 4.26901944886006, + "grad_norm": 0.2246800183907757, + "learning_rate": 1.0463463873471622e-05, + "loss": 0.241, + "num_tokens": 2386168634.0, + "step": 3129 + }, + { + "epoch": 4.2703850298187405, + "grad_norm": 0.22071674913394, + "learning_rate": 1.0458501922123454e-05, + "loss": 0.2303, + "num_tokens": 2386923642.0, + "step": 3130 + }, + { + "epoch": 4.271750610777421, + "grad_norm": 0.206462589627499, + "learning_rate": 1.0453540135962898e-05, + "loss": 0.2405, + "num_tokens": 2387756070.0, + "step": 3131 + }, + { + "epoch": 4.273116191736102, + "grad_norm": 0.19701788031258882, + "learning_rate": 1.0448578516503583e-05, + "loss": 0.2228, + "num_tokens": 2388485437.0, + "step": 3132 + }, + { + "epoch": 4.274481772694782, + "grad_norm": 0.2231102024361708, + "learning_rate": 1.044361706525908e-05, + "loss": 0.2337, + "num_tokens": 2389268384.0, + "step": 3133 + }, + { + "epoch": 4.275847353653463, + "grad_norm": 0.19923881354704373, + "learning_rate": 1.0438655783742919e-05, + "loss": 0.2382, + "num_tokens": 2389988167.0, + "step": 3134 + }, + { + "epoch": 4.277212934612143, + "grad_norm": 0.22460552365854378, + "learning_rate": 1.0433694673468564e-05, + "loss": 0.2385, + "num_tokens": 2390723404.0, + "step": 3135 + }, + { + "epoch": 4.278578515570824, + "grad_norm": 0.2227902728338538, + "learning_rate": 1.0428733735949444e-05, + "loss": 0.2366, + "num_tokens": 2391423418.0, + "step": 3136 + }, + { + "epoch": 4.279944096529504, + "grad_norm": 0.2231116027875236, + "learning_rate": 1.0423772972698918e-05, + "loss": 0.2409, + "num_tokens": 2392204783.0, + "step": 3137 + }, + { + "epoch": 4.281309677488185, + "grad_norm": 0.20626602628903765, + "learning_rate": 1.0418812385230305e-05, + "loss": 0.2343, + "num_tokens": 2392995363.0, + "step": 3138 + }, + { + "epoch": 4.282675258446865, + "grad_norm": 0.20258110015794845, + "learning_rate": 1.0413851975056868e-05, + "loss": 0.2305, + "num_tokens": 2393823741.0, + "step": 3139 + }, + { + "epoch": 4.284040839405545, + "grad_norm": 0.22402991385136467, + "learning_rate": 1.0408891743691805e-05, + "loss": 0.2372, + "num_tokens": 2394588195.0, + "step": 3140 + }, + { + "epoch": 4.285406420364226, + "grad_norm": 0.2084262540734205, + "learning_rate": 1.0403931692648272e-05, + "loss": 0.2436, + "num_tokens": 2395293177.0, + "step": 3141 + }, + { + "epoch": 4.286772001322906, + "grad_norm": 0.2152700258121204, + "learning_rate": 1.0398971823439365e-05, + "loss": 0.2505, + "num_tokens": 2396163039.0, + "step": 3142 + }, + { + "epoch": 4.288137582281587, + "grad_norm": 0.20905300767420062, + "learning_rate": 1.0394012137578134e-05, + "loss": 0.2306, + "num_tokens": 2396958148.0, + "step": 3143 + }, + { + "epoch": 4.289503163240267, + "grad_norm": 0.23328709792889446, + "learning_rate": 1.0389052636577548e-05, + "loss": 0.2404, + "num_tokens": 2397696994.0, + "step": 3144 + }, + { + "epoch": 4.290868744198948, + "grad_norm": 0.21807869380071418, + "learning_rate": 1.0384093321950547e-05, + "loss": 0.2337, + "num_tokens": 2398439653.0, + "step": 3145 + }, + { + "epoch": 4.2922343251576285, + "grad_norm": 0.2003066558491781, + "learning_rate": 1.0379134195210003e-05, + "loss": 0.2403, + "num_tokens": 2399220416.0, + "step": 3146 + }, + { + "epoch": 4.2935999061163095, + "grad_norm": 0.21146479468074358, + "learning_rate": 1.0374175257868728e-05, + "loss": 0.242, + "num_tokens": 2399974725.0, + "step": 3147 + }, + { + "epoch": 4.29496548707499, + "grad_norm": 0.2089664142422123, + "learning_rate": 1.0369216511439482e-05, + "loss": 0.2399, + "num_tokens": 2400766156.0, + "step": 3148 + }, + { + "epoch": 4.29633106803367, + "grad_norm": 0.21396785238387414, + "learning_rate": 1.0364257957434958e-05, + "loss": 0.2413, + "num_tokens": 2401474912.0, + "step": 3149 + }, + { + "epoch": 4.297696648992351, + "grad_norm": 0.22655708308340436, + "learning_rate": 1.03592995973678e-05, + "loss": 0.2348, + "num_tokens": 2402232037.0, + "step": 3150 + }, + { + "epoch": 4.299062229951031, + "grad_norm": 0.19453260614855458, + "learning_rate": 1.0354341432750587e-05, + "loss": 0.2314, + "num_tokens": 2403054864.0, + "step": 3151 + }, + { + "epoch": 4.300427810909712, + "grad_norm": 0.21400587071073163, + "learning_rate": 1.0349383465095845e-05, + "loss": 0.2362, + "num_tokens": 2403761055.0, + "step": 3152 + }, + { + "epoch": 4.301793391868392, + "grad_norm": 0.20024932788097385, + "learning_rate": 1.0344425695916029e-05, + "loss": 0.2376, + "num_tokens": 2404526195.0, + "step": 3153 + }, + { + "epoch": 4.303158972827073, + "grad_norm": 0.2067160122582665, + "learning_rate": 1.0339468126723546e-05, + "loss": 0.2405, + "num_tokens": 2405328612.0, + "step": 3154 + }, + { + "epoch": 4.304524553785753, + "grad_norm": 0.19658196290769328, + "learning_rate": 1.0334510759030732e-05, + "loss": 0.234, + "num_tokens": 2406075822.0, + "step": 3155 + }, + { + "epoch": 4.305890134744434, + "grad_norm": 0.19628473536842633, + "learning_rate": 1.0329553594349861e-05, + "loss": 0.2375, + "num_tokens": 2406934728.0, + "step": 3156 + }, + { + "epoch": 4.307255715703114, + "grad_norm": 0.20097333657444924, + "learning_rate": 1.032459663419316e-05, + "loss": 0.2319, + "num_tokens": 2407688977.0, + "step": 3157 + }, + { + "epoch": 4.308621296661794, + "grad_norm": 0.19747013620311202, + "learning_rate": 1.0319639880072772e-05, + "loss": 0.2301, + "num_tokens": 2408497714.0, + "step": 3158 + }, + { + "epoch": 4.309986877620475, + "grad_norm": 0.20800911509942782, + "learning_rate": 1.0314683333500795e-05, + "loss": 0.2383, + "num_tokens": 2409221552.0, + "step": 3159 + }, + { + "epoch": 4.311352458579155, + "grad_norm": 0.20078328818355712, + "learning_rate": 1.0309726995989251e-05, + "loss": 0.2292, + "num_tokens": 2410012151.0, + "step": 3160 + }, + { + "epoch": 4.312718039537836, + "grad_norm": 0.20614611838720331, + "learning_rate": 1.0304770869050111e-05, + "loss": 0.2424, + "num_tokens": 2410855500.0, + "step": 3161 + }, + { + "epoch": 4.3140836204965165, + "grad_norm": 0.21321262504507643, + "learning_rate": 1.0299814954195272e-05, + "loss": 0.233, + "num_tokens": 2411571588.0, + "step": 3162 + }, + { + "epoch": 4.3154492014551975, + "grad_norm": 0.20271265101863933, + "learning_rate": 1.0294859252936564e-05, + "loss": 0.2445, + "num_tokens": 2412345733.0, + "step": 3163 + }, + { + "epoch": 4.316814782413878, + "grad_norm": 0.20768513136762629, + "learning_rate": 1.0289903766785765e-05, + "loss": 0.2344, + "num_tokens": 2413037575.0, + "step": 3164 + }, + { + "epoch": 4.318180363372559, + "grad_norm": 0.20496357955517733, + "learning_rate": 1.0284948497254573e-05, + "loss": 0.2439, + "num_tokens": 2413823654.0, + "step": 3165 + }, + { + "epoch": 4.319545944331239, + "grad_norm": 0.21226865854034346, + "learning_rate": 1.0279993445854629e-05, + "loss": 0.238, + "num_tokens": 2414512563.0, + "step": 3166 + }, + { + "epoch": 4.320911525289919, + "grad_norm": 0.21760319472802733, + "learning_rate": 1.02750386140975e-05, + "loss": 0.2399, + "num_tokens": 2415234866.0, + "step": 3167 + }, + { + "epoch": 4.3222771062486, + "grad_norm": 0.20667083464791902, + "learning_rate": 1.02700840034947e-05, + "loss": 0.2419, + "num_tokens": 2415958555.0, + "step": 3168 + }, + { + "epoch": 4.32364268720728, + "grad_norm": 0.20095148506544197, + "learning_rate": 1.0265129615557654e-05, + "loss": 0.2406, + "num_tokens": 2416748290.0, + "step": 3169 + }, + { + "epoch": 4.325008268165961, + "grad_norm": 0.2071294201450318, + "learning_rate": 1.0260175451797745e-05, + "loss": 0.2443, + "num_tokens": 2417495153.0, + "step": 3170 + }, + { + "epoch": 4.326373849124641, + "grad_norm": 0.20614911034029376, + "learning_rate": 1.0255221513726266e-05, + "loss": 0.241, + "num_tokens": 2418280297.0, + "step": 3171 + }, + { + "epoch": 4.327739430083322, + "grad_norm": 0.2648824453267282, + "learning_rate": 1.0250267802854443e-05, + "loss": 0.2236, + "num_tokens": 2418994602.0, + "step": 3172 + }, + { + "epoch": 4.329105011042002, + "grad_norm": 0.2094247959596038, + "learning_rate": 1.0245314320693452e-05, + "loss": 0.2299, + "num_tokens": 2419688727.0, + "step": 3173 + }, + { + "epoch": 4.330470592000683, + "grad_norm": 0.19783690675715898, + "learning_rate": 1.0240361068754377e-05, + "loss": 0.2202, + "num_tokens": 2420412511.0, + "step": 3174 + }, + { + "epoch": 4.331836172959363, + "grad_norm": 0.22172887457051177, + "learning_rate": 1.0235408048548241e-05, + "loss": 0.2404, + "num_tokens": 2421153131.0, + "step": 3175 + }, + { + "epoch": 4.333201753918043, + "grad_norm": 0.20572281948309226, + "learning_rate": 1.0230455261585999e-05, + "loss": 0.2341, + "num_tokens": 2421864895.0, + "step": 3176 + }, + { + "epoch": 4.334567334876724, + "grad_norm": 0.20894601567127416, + "learning_rate": 1.0225502709378529e-05, + "loss": 0.2384, + "num_tokens": 2422672677.0, + "step": 3177 + }, + { + "epoch": 4.3359329158354045, + "grad_norm": 0.21486751628070896, + "learning_rate": 1.0220550393436646e-05, + "loss": 0.2327, + "num_tokens": 2423380998.0, + "step": 3178 + }, + { + "epoch": 4.3372984967940855, + "grad_norm": 0.19765620045895044, + "learning_rate": 1.0215598315271076e-05, + "loss": 0.2325, + "num_tokens": 2424129863.0, + "step": 3179 + }, + { + "epoch": 4.338664077752766, + "grad_norm": 0.2059990760145127, + "learning_rate": 1.0210646476392498e-05, + "loss": 0.2319, + "num_tokens": 2424915607.0, + "step": 3180 + }, + { + "epoch": 4.340029658711447, + "grad_norm": 0.19399451821854877, + "learning_rate": 1.020569487831149e-05, + "loss": 0.2436, + "num_tokens": 2425783830.0, + "step": 3181 + }, + { + "epoch": 4.341395239670127, + "grad_norm": 0.20282382613045738, + "learning_rate": 1.0200743522538579e-05, + "loss": 0.2342, + "num_tokens": 2426582468.0, + "step": 3182 + }, + { + "epoch": 4.342760820628808, + "grad_norm": 0.20726209468098505, + "learning_rate": 1.0195792410584206e-05, + "loss": 0.2411, + "num_tokens": 2427393030.0, + "step": 3183 + }, + { + "epoch": 4.344126401587488, + "grad_norm": 0.18939424772470798, + "learning_rate": 1.0190841543958743e-05, + "loss": 0.2382, + "num_tokens": 2428214398.0, + "step": 3184 + }, + { + "epoch": 4.345491982546168, + "grad_norm": 0.20025748370346355, + "learning_rate": 1.018589092417248e-05, + "loss": 0.2575, + "num_tokens": 2429001351.0, + "step": 3185 + }, + { + "epoch": 4.346857563504849, + "grad_norm": 0.21525428778025574, + "learning_rate": 1.0180940552735643e-05, + "loss": 0.2262, + "num_tokens": 2429752578.0, + "step": 3186 + }, + { + "epoch": 4.348223144463529, + "grad_norm": 0.1982954868386459, + "learning_rate": 1.0175990431158374e-05, + "loss": 0.2397, + "num_tokens": 2430512562.0, + "step": 3187 + }, + { + "epoch": 4.34958872542221, + "grad_norm": 0.21795621173091057, + "learning_rate": 1.0171040560950733e-05, + "loss": 0.2416, + "num_tokens": 2431256168.0, + "step": 3188 + }, + { + "epoch": 4.35095430638089, + "grad_norm": 0.19452179481071497, + "learning_rate": 1.0166090943622722e-05, + "loss": 0.2356, + "num_tokens": 2432042331.0, + "step": 3189 + }, + { + "epoch": 4.352319887339571, + "grad_norm": 0.20489150457548222, + "learning_rate": 1.016114158068425e-05, + "loss": 0.2316, + "num_tokens": 2432800248.0, + "step": 3190 + }, + { + "epoch": 4.353685468298251, + "grad_norm": 0.2059221878183246, + "learning_rate": 1.0156192473645155e-05, + "loss": 0.2297, + "num_tokens": 2433530159.0, + "step": 3191 + }, + { + "epoch": 4.355051049256932, + "grad_norm": 0.190457295859458, + "learning_rate": 1.0151243624015193e-05, + "loss": 0.2373, + "num_tokens": 2434315894.0, + "step": 3192 + }, + { + "epoch": 4.356416630215612, + "grad_norm": 0.21550763986197916, + "learning_rate": 1.0146295033304046e-05, + "loss": 0.2405, + "num_tokens": 2435099043.0, + "step": 3193 + }, + { + "epoch": 4.3577822111742925, + "grad_norm": 0.19677757105908222, + "learning_rate": 1.0141346703021315e-05, + "loss": 0.2397, + "num_tokens": 2435958479.0, + "step": 3194 + }, + { + "epoch": 4.3591477921329735, + "grad_norm": 0.19685748179310872, + "learning_rate": 1.0136398634676517e-05, + "loss": 0.2384, + "num_tokens": 2436683495.0, + "step": 3195 + }, + { + "epoch": 4.360513373091654, + "grad_norm": 0.2166413069222805, + "learning_rate": 1.0131450829779097e-05, + "loss": 0.2447, + "num_tokens": 2437404356.0, + "step": 3196 + }, + { + "epoch": 4.361878954050335, + "grad_norm": 0.198582164669502, + "learning_rate": 1.0126503289838412e-05, + "loss": 0.2251, + "num_tokens": 2438129504.0, + "step": 3197 + }, + { + "epoch": 4.363244535009015, + "grad_norm": 0.2142279846458367, + "learning_rate": 1.012155601636375e-05, + "loss": 0.2378, + "num_tokens": 2438825462.0, + "step": 3198 + }, + { + "epoch": 4.364610115967696, + "grad_norm": 0.22398051421244974, + "learning_rate": 1.01166090108643e-05, + "loss": 0.2401, + "num_tokens": 2439576051.0, + "step": 3199 + }, + { + "epoch": 4.365975696926376, + "grad_norm": 0.2034821742452059, + "learning_rate": 1.011166227484919e-05, + "loss": 0.2468, + "num_tokens": 2440383564.0, + "step": 3200 + }, + { + "epoch": 4.367341277885057, + "grad_norm": 0.19885246061455455, + "learning_rate": 1.0106715809827447e-05, + "loss": 0.232, + "num_tokens": 2441081604.0, + "step": 3201 + }, + { + "epoch": 4.368706858843737, + "grad_norm": 0.2188221875323839, + "learning_rate": 1.0101769617308026e-05, + "loss": 0.2336, + "num_tokens": 2441844908.0, + "step": 3202 + }, + { + "epoch": 4.370072439802417, + "grad_norm": 0.2092353099898902, + "learning_rate": 1.0096823698799795e-05, + "loss": 0.2364, + "num_tokens": 2442576566.0, + "step": 3203 + }, + { + "epoch": 4.371438020761098, + "grad_norm": 0.195727479191234, + "learning_rate": 1.0091878055811542e-05, + "loss": 0.2308, + "num_tokens": 2443362562.0, + "step": 3204 + }, + { + "epoch": 4.372803601719778, + "grad_norm": 0.2134476596014767, + "learning_rate": 1.0086932689851965e-05, + "loss": 0.2427, + "num_tokens": 2444184204.0, + "step": 3205 + }, + { + "epoch": 4.374169182678459, + "grad_norm": 0.22100931039205451, + "learning_rate": 1.0081987602429682e-05, + "loss": 0.2434, + "num_tokens": 2444981261.0, + "step": 3206 + }, + { + "epoch": 4.375534763637139, + "grad_norm": 0.2029730722178828, + "learning_rate": 1.0077042795053227e-05, + "loss": 0.2352, + "num_tokens": 2445796301.0, + "step": 3207 + }, + { + "epoch": 4.37690034459582, + "grad_norm": 0.2020227558159363, + "learning_rate": 1.0072098269231043e-05, + "loss": 0.2307, + "num_tokens": 2446613119.0, + "step": 3208 + }, + { + "epoch": 4.3782659255545004, + "grad_norm": 0.20150488223791843, + "learning_rate": 1.0067154026471497e-05, + "loss": 0.2431, + "num_tokens": 2447398254.0, + "step": 3209 + }, + { + "epoch": 4.3796315065131814, + "grad_norm": 0.21099602413069868, + "learning_rate": 1.0062210068282859e-05, + "loss": 0.2321, + "num_tokens": 2448110862.0, + "step": 3210 + }, + { + "epoch": 4.380997087471862, + "grad_norm": 0.20908240447364793, + "learning_rate": 1.0057266396173315e-05, + "loss": 0.2242, + "num_tokens": 2448794908.0, + "step": 3211 + }, + { + "epoch": 4.382362668430542, + "grad_norm": 0.18801575799228207, + "learning_rate": 1.0052323011650969e-05, + "loss": 0.2333, + "num_tokens": 2449577799.0, + "step": 3212 + }, + { + "epoch": 4.383728249389223, + "grad_norm": 0.22489598773748104, + "learning_rate": 1.004737991622383e-05, + "loss": 0.2458, + "num_tokens": 2450407875.0, + "step": 3213 + }, + { + "epoch": 4.385093830347903, + "grad_norm": 0.21934138708502926, + "learning_rate": 1.0042437111399825e-05, + "loss": 0.2449, + "num_tokens": 2451142631.0, + "step": 3214 + }, + { + "epoch": 4.386459411306584, + "grad_norm": 0.20893512927754052, + "learning_rate": 1.0037494598686785e-05, + "loss": 0.2396, + "num_tokens": 2451907223.0, + "step": 3215 + }, + { + "epoch": 4.387824992265264, + "grad_norm": 0.21434253921789745, + "learning_rate": 1.0032552379592467e-05, + "loss": 0.2353, + "num_tokens": 2452625317.0, + "step": 3216 + }, + { + "epoch": 4.389190573223945, + "grad_norm": 0.2119410518015286, + "learning_rate": 1.0027610455624517e-05, + "loss": 0.2314, + "num_tokens": 2453378767.0, + "step": 3217 + }, + { + "epoch": 4.390556154182625, + "grad_norm": 0.21634134716854447, + "learning_rate": 1.0022668828290508e-05, + "loss": 0.2351, + "num_tokens": 2454101567.0, + "step": 3218 + }, + { + "epoch": 4.391921735141306, + "grad_norm": 0.2135340501304843, + "learning_rate": 1.0017727499097916e-05, + "loss": 0.2417, + "num_tokens": 2454837984.0, + "step": 3219 + }, + { + "epoch": 4.393287316099986, + "grad_norm": 0.18534037279818108, + "learning_rate": 1.001278646955412e-05, + "loss": 0.2402, + "num_tokens": 2455643241.0, + "step": 3220 + }, + { + "epoch": 4.394652897058666, + "grad_norm": 0.20852008140909967, + "learning_rate": 1.0007845741166427e-05, + "loss": 0.2478, + "num_tokens": 2456412463.0, + "step": 3221 + }, + { + "epoch": 4.396018478017347, + "grad_norm": 0.2234845081015311, + "learning_rate": 1.0002905315442024e-05, + "loss": 0.2424, + "num_tokens": 2457147773.0, + "step": 3222 + }, + { + "epoch": 4.397384058976027, + "grad_norm": 0.19511750536196829, + "learning_rate": 9.997965193888038e-06, + "loss": 0.2278, + "num_tokens": 2457841609.0, + "step": 3223 + }, + { + "epoch": 4.398749639934708, + "grad_norm": 0.21038812018173747, + "learning_rate": 9.993025378011467e-06, + "loss": 0.2359, + "num_tokens": 2458671666.0, + "step": 3224 + }, + { + "epoch": 4.4001152208933885, + "grad_norm": 0.19728437399231036, + "learning_rate": 9.988085869319252e-06, + "loss": 0.2365, + "num_tokens": 2459428955.0, + "step": 3225 + }, + { + "epoch": 4.4014808018520695, + "grad_norm": 0.21251665627832095, + "learning_rate": 9.983146669318215e-06, + "loss": 0.2359, + "num_tokens": 2460115392.0, + "step": 3226 + }, + { + "epoch": 4.40284638281075, + "grad_norm": 0.229044435398285, + "learning_rate": 9.97820777951509e-06, + "loss": 0.2427, + "num_tokens": 2460839201.0, + "step": 3227 + }, + { + "epoch": 4.404211963769431, + "grad_norm": 0.20574494216169656, + "learning_rate": 9.973269201416524e-06, + "loss": 0.2353, + "num_tokens": 2461582259.0, + "step": 3228 + }, + { + "epoch": 4.405577544728111, + "grad_norm": 0.21740023731402494, + "learning_rate": 9.96833093652906e-06, + "loss": 0.247, + "num_tokens": 2462362976.0, + "step": 3229 + }, + { + "epoch": 4.406943125686791, + "grad_norm": 0.19064979701411286, + "learning_rate": 9.96339298635915e-06, + "loss": 0.2318, + "num_tokens": 2463196713.0, + "step": 3230 + }, + { + "epoch": 4.408308706645472, + "grad_norm": 0.2182592900958331, + "learning_rate": 9.958455352413145e-06, + "loss": 0.2469, + "num_tokens": 2463916038.0, + "step": 3231 + }, + { + "epoch": 4.409674287604152, + "grad_norm": 0.2103058438889468, + "learning_rate": 9.953518036197312e-06, + "loss": 0.2389, + "num_tokens": 2464656861.0, + "step": 3232 + }, + { + "epoch": 4.411039868562833, + "grad_norm": 0.2023554176311186, + "learning_rate": 9.948581039217806e-06, + "loss": 0.2332, + "num_tokens": 2465406953.0, + "step": 3233 + }, + { + "epoch": 4.412405449521513, + "grad_norm": 0.21048809218798498, + "learning_rate": 9.943644362980687e-06, + "loss": 0.2236, + "num_tokens": 2466173926.0, + "step": 3234 + }, + { + "epoch": 4.413771030480194, + "grad_norm": 0.2019177209699721, + "learning_rate": 9.938708008991928e-06, + "loss": 0.2362, + "num_tokens": 2466924657.0, + "step": 3235 + }, + { + "epoch": 4.415136611438874, + "grad_norm": 0.21595055939574123, + "learning_rate": 9.933771978757396e-06, + "loss": 0.2395, + "num_tokens": 2467719516.0, + "step": 3236 + }, + { + "epoch": 4.416502192397555, + "grad_norm": 0.2121565972395305, + "learning_rate": 9.92883627378286e-06, + "loss": 0.2375, + "num_tokens": 2468461885.0, + "step": 3237 + }, + { + "epoch": 4.417867773356235, + "grad_norm": 0.21919083427897634, + "learning_rate": 9.923900895573986e-06, + "loss": 0.2382, + "num_tokens": 2469170003.0, + "step": 3238 + }, + { + "epoch": 4.419233354314915, + "grad_norm": 0.20824971279247714, + "learning_rate": 9.918965845636348e-06, + "loss": 0.2296, + "num_tokens": 2469924858.0, + "step": 3239 + }, + { + "epoch": 4.420598935273596, + "grad_norm": 0.2183487509589547, + "learning_rate": 9.914031125475415e-06, + "loss": 0.2322, + "num_tokens": 2470662621.0, + "step": 3240 + }, + { + "epoch": 4.4219645162322765, + "grad_norm": 0.19647849103113701, + "learning_rate": 9.909096736596557e-06, + "loss": 0.2369, + "num_tokens": 2471440349.0, + "step": 3241 + }, + { + "epoch": 4.4233300971909575, + "grad_norm": 0.21332368129201937, + "learning_rate": 9.904162680505044e-06, + "loss": 0.2299, + "num_tokens": 2472177764.0, + "step": 3242 + }, + { + "epoch": 4.424695678149638, + "grad_norm": 0.2111524598451786, + "learning_rate": 9.89922895870603e-06, + "loss": 0.2365, + "num_tokens": 2472907886.0, + "step": 3243 + }, + { + "epoch": 4.426061259108319, + "grad_norm": 0.19820067040318498, + "learning_rate": 9.894295572704603e-06, + "loss": 0.2386, + "num_tokens": 2473720043.0, + "step": 3244 + }, + { + "epoch": 4.427426840066999, + "grad_norm": 0.20124821926463773, + "learning_rate": 9.889362524005706e-06, + "loss": 0.2279, + "num_tokens": 2474470385.0, + "step": 3245 + }, + { + "epoch": 4.42879242102568, + "grad_norm": 0.21317536260262898, + "learning_rate": 9.884429814114207e-06, + "loss": 0.2475, + "num_tokens": 2475282151.0, + "step": 3246 + }, + { + "epoch": 4.43015800198436, + "grad_norm": 0.20750393612863824, + "learning_rate": 9.87949744453486e-06, + "loss": 0.2482, + "num_tokens": 2476063741.0, + "step": 3247 + }, + { + "epoch": 4.43152358294304, + "grad_norm": 0.20594362347063608, + "learning_rate": 9.874565416772319e-06, + "loss": 0.235, + "num_tokens": 2476809266.0, + "step": 3248 + }, + { + "epoch": 4.432889163901721, + "grad_norm": 0.2168834010189078, + "learning_rate": 9.86963373233113e-06, + "loss": 0.2427, + "num_tokens": 2477520149.0, + "step": 3249 + }, + { + "epoch": 4.434254744860401, + "grad_norm": 0.20372742886844014, + "learning_rate": 9.864702392715734e-06, + "loss": 0.2438, + "num_tokens": 2478320412.0, + "step": 3250 + }, + { + "epoch": 4.435620325819082, + "grad_norm": 0.20486951899459122, + "learning_rate": 9.85977139943048e-06, + "loss": 0.243, + "num_tokens": 2479062242.0, + "step": 3251 + }, + { + "epoch": 4.436985906777762, + "grad_norm": 0.21228526721126598, + "learning_rate": 9.854840753979587e-06, + "loss": 0.2388, + "num_tokens": 2479839646.0, + "step": 3252 + }, + { + "epoch": 4.438351487736443, + "grad_norm": 0.19913430550378447, + "learning_rate": 9.84991045786719e-06, + "loss": 0.2407, + "num_tokens": 2480638959.0, + "step": 3253 + }, + { + "epoch": 4.439717068695123, + "grad_norm": 0.2017997618471524, + "learning_rate": 9.844980512597302e-06, + "loss": 0.2368, + "num_tokens": 2481365238.0, + "step": 3254 + }, + { + "epoch": 4.441082649653804, + "grad_norm": 0.20434163650274012, + "learning_rate": 9.840050919673845e-06, + "loss": 0.2405, + "num_tokens": 2482107740.0, + "step": 3255 + }, + { + "epoch": 4.442448230612484, + "grad_norm": 0.19822177518863007, + "learning_rate": 9.835121680600618e-06, + "loss": 0.239, + "num_tokens": 2482836297.0, + "step": 3256 + }, + { + "epoch": 4.4438138115711645, + "grad_norm": 0.21469634362688822, + "learning_rate": 9.83019279688132e-06, + "loss": 0.2443, + "num_tokens": 2483649352.0, + "step": 3257 + }, + { + "epoch": 4.4451793925298455, + "grad_norm": 0.21070001557931048, + "learning_rate": 9.825264270019538e-06, + "loss": 0.2332, + "num_tokens": 2484338254.0, + "step": 3258 + }, + { + "epoch": 4.446544973488526, + "grad_norm": 0.19699565545353745, + "learning_rate": 9.820336101518752e-06, + "loss": 0.2369, + "num_tokens": 2485088732.0, + "step": 3259 + }, + { + "epoch": 4.447910554447207, + "grad_norm": 0.1993014990435232, + "learning_rate": 9.81540829288234e-06, + "loss": 0.218, + "num_tokens": 2485799066.0, + "step": 3260 + }, + { + "epoch": 4.449276135405887, + "grad_norm": 0.17928026202144806, + "learning_rate": 9.810480845613553e-06, + "loss": 0.2357, + "num_tokens": 2486661791.0, + "step": 3261 + }, + { + "epoch": 4.450641716364568, + "grad_norm": 0.21149656833809066, + "learning_rate": 9.805553761215548e-06, + "loss": 0.2356, + "num_tokens": 2487353802.0, + "step": 3262 + }, + { + "epoch": 4.452007297323248, + "grad_norm": 0.21727423197286788, + "learning_rate": 9.800627041191362e-06, + "loss": 0.234, + "num_tokens": 2488128649.0, + "step": 3263 + }, + { + "epoch": 4.453372878281929, + "grad_norm": 0.19898821088816412, + "learning_rate": 9.795700687043928e-06, + "loss": 0.2384, + "num_tokens": 2488933739.0, + "step": 3264 + }, + { + "epoch": 4.454738459240609, + "grad_norm": 0.20650435184661378, + "learning_rate": 9.79077470027606e-06, + "loss": 0.252, + "num_tokens": 2489743130.0, + "step": 3265 + }, + { + "epoch": 4.456104040199289, + "grad_norm": 0.199590586910346, + "learning_rate": 9.785849082390462e-06, + "loss": 0.2401, + "num_tokens": 2490509967.0, + "step": 3266 + }, + { + "epoch": 4.45746962115797, + "grad_norm": 0.1930771290506454, + "learning_rate": 9.78092383488973e-06, + "loss": 0.2313, + "num_tokens": 2491287752.0, + "step": 3267 + }, + { + "epoch": 4.45883520211665, + "grad_norm": 0.21050522483695722, + "learning_rate": 9.77599895927634e-06, + "loss": 0.2527, + "num_tokens": 2492043005.0, + "step": 3268 + }, + { + "epoch": 4.460200783075331, + "grad_norm": 0.2129229154911381, + "learning_rate": 9.771074457052665e-06, + "loss": 0.2294, + "num_tokens": 2492739112.0, + "step": 3269 + }, + { + "epoch": 4.461566364034011, + "grad_norm": 0.2022771643785076, + "learning_rate": 9.766150329720947e-06, + "loss": 0.2432, + "num_tokens": 2493533341.0, + "step": 3270 + }, + { + "epoch": 4.462931944992692, + "grad_norm": 0.21981064931441902, + "learning_rate": 9.761226578783336e-06, + "loss": 0.2444, + "num_tokens": 2494303289.0, + "step": 3271 + }, + { + "epoch": 4.464297525951372, + "grad_norm": 0.2114809359313964, + "learning_rate": 9.756303205741846e-06, + "loss": 0.2435, + "num_tokens": 2495021527.0, + "step": 3272 + }, + { + "epoch": 4.465663106910053, + "grad_norm": 0.21435818822439512, + "learning_rate": 9.751380212098386e-06, + "loss": 0.2376, + "num_tokens": 2495732215.0, + "step": 3273 + }, + { + "epoch": 4.4670286878687335, + "grad_norm": 0.2084327439520346, + "learning_rate": 9.746457599354755e-06, + "loss": 0.2388, + "num_tokens": 2496477046.0, + "step": 3274 + }, + { + "epoch": 4.468394268827414, + "grad_norm": 0.19844860293568437, + "learning_rate": 9.74153536901262e-06, + "loss": 0.2311, + "num_tokens": 2497271207.0, + "step": 3275 + }, + { + "epoch": 4.469759849786095, + "grad_norm": 0.21147953293208258, + "learning_rate": 9.736613522573548e-06, + "loss": 0.2323, + "num_tokens": 2498050945.0, + "step": 3276 + }, + { + "epoch": 4.471125430744775, + "grad_norm": 0.19124672639813825, + "learning_rate": 9.731692061538973e-06, + "loss": 0.2323, + "num_tokens": 2498787668.0, + "step": 3277 + }, + { + "epoch": 4.472491011703456, + "grad_norm": 0.21026908842406816, + "learning_rate": 9.726770987410228e-06, + "loss": 0.2432, + "num_tokens": 2499536440.0, + "step": 3278 + }, + { + "epoch": 4.473856592662136, + "grad_norm": 0.20210406285693505, + "learning_rate": 9.721850301688518e-06, + "loss": 0.2297, + "num_tokens": 2500317033.0, + "step": 3279 + }, + { + "epoch": 4.475222173620817, + "grad_norm": 0.2235049615157691, + "learning_rate": 9.716930005874923e-06, + "loss": 0.2264, + "num_tokens": 2501050533.0, + "step": 3280 + }, + { + "epoch": 4.476587754579497, + "grad_norm": 0.209022674740525, + "learning_rate": 9.712010101470422e-06, + "loss": 0.2389, + "num_tokens": 2501802209.0, + "step": 3281 + }, + { + "epoch": 4.477953335538178, + "grad_norm": 0.1939644881159321, + "learning_rate": 9.707090589975858e-06, + "loss": 0.2264, + "num_tokens": 2502546403.0, + "step": 3282 + }, + { + "epoch": 4.479318916496858, + "grad_norm": 0.18758983732041834, + "learning_rate": 9.702171472891968e-06, + "loss": 0.2319, + "num_tokens": 2503320996.0, + "step": 3283 + }, + { + "epoch": 4.480684497455538, + "grad_norm": 0.18854466401622086, + "learning_rate": 9.697252751719355e-06, + "loss": 0.2378, + "num_tokens": 2504110952.0, + "step": 3284 + }, + { + "epoch": 4.482050078414219, + "grad_norm": 0.21339261827436723, + "learning_rate": 9.69233442795851e-06, + "loss": 0.2402, + "num_tokens": 2504822869.0, + "step": 3285 + }, + { + "epoch": 4.483415659372899, + "grad_norm": 0.20010072466409926, + "learning_rate": 9.687416503109798e-06, + "loss": 0.2447, + "num_tokens": 2505633146.0, + "step": 3286 + }, + { + "epoch": 4.48478124033158, + "grad_norm": 0.22004160438136497, + "learning_rate": 9.682498978673474e-06, + "loss": 0.2369, + "num_tokens": 2506383091.0, + "step": 3287 + }, + { + "epoch": 4.48614682129026, + "grad_norm": 0.20969941905338899, + "learning_rate": 9.677581856149654e-06, + "loss": 0.2402, + "num_tokens": 2507197599.0, + "step": 3288 + }, + { + "epoch": 4.487512402248941, + "grad_norm": 0.19788013714432823, + "learning_rate": 9.672665137038337e-06, + "loss": 0.2371, + "num_tokens": 2507937058.0, + "step": 3289 + }, + { + "epoch": 4.4888779832076215, + "grad_norm": 0.2099582888400549, + "learning_rate": 9.667748822839406e-06, + "loss": 0.2436, + "num_tokens": 2508703781.0, + "step": 3290 + }, + { + "epoch": 4.4902435641663025, + "grad_norm": 0.22515901315128767, + "learning_rate": 9.662832915052616e-06, + "loss": 0.2356, + "num_tokens": 2509409480.0, + "step": 3291 + }, + { + "epoch": 4.491609145124983, + "grad_norm": 0.2260261339761146, + "learning_rate": 9.657917415177596e-06, + "loss": 0.2339, + "num_tokens": 2510154327.0, + "step": 3292 + }, + { + "epoch": 4.492974726083663, + "grad_norm": 0.2022569419378603, + "learning_rate": 9.653002324713849e-06, + "loss": 0.2431, + "num_tokens": 2511004262.0, + "step": 3293 + }, + { + "epoch": 4.494340307042344, + "grad_norm": 0.20541387968765265, + "learning_rate": 9.648087645160763e-06, + "loss": 0.2401, + "num_tokens": 2511782251.0, + "step": 3294 + }, + { + "epoch": 4.495705888001024, + "grad_norm": 0.19875464995945022, + "learning_rate": 9.64317337801759e-06, + "loss": 0.2275, + "num_tokens": 2512435363.0, + "step": 3295 + }, + { + "epoch": 4.497071468959705, + "grad_norm": 0.21337728152647756, + "learning_rate": 9.638259524783455e-06, + "loss": 0.2451, + "num_tokens": 2513169168.0, + "step": 3296 + }, + { + "epoch": 4.498437049918385, + "grad_norm": 0.21168871412080995, + "learning_rate": 9.633346086957374e-06, + "loss": 0.2343, + "num_tokens": 2513949184.0, + "step": 3297 + }, + { + "epoch": 4.499802630877066, + "grad_norm": 0.20860766639760842, + "learning_rate": 9.62843306603821e-06, + "loss": 0.2405, + "num_tokens": 2514760142.0, + "step": 3298 + }, + { + "epoch": 4.501168211835746, + "grad_norm": 0.18979387273643028, + "learning_rate": 9.623520463524725e-06, + "loss": 0.2368, + "num_tokens": 2515582299.0, + "step": 3299 + }, + { + "epoch": 4.502533792794427, + "grad_norm": 0.203169628640453, + "learning_rate": 9.618608280915534e-06, + "loss": 0.2224, + "num_tokens": 2516262855.0, + "step": 3300 + }, + { + "epoch": 4.503899373753107, + "grad_norm": 0.19228877407098788, + "learning_rate": 9.613696519709137e-06, + "loss": 0.2326, + "num_tokens": 2517010244.0, + "step": 3301 + }, + { + "epoch": 4.505264954711787, + "grad_norm": 0.20171852543180552, + "learning_rate": 9.608785181403892e-06, + "loss": 0.2461, + "num_tokens": 2517795363.0, + "step": 3302 + }, + { + "epoch": 4.506630535670468, + "grad_norm": 0.20444488568503827, + "learning_rate": 9.603874267498043e-06, + "loss": 0.2378, + "num_tokens": 2518603689.0, + "step": 3303 + }, + { + "epoch": 4.5079961166291485, + "grad_norm": 0.2206896085005018, + "learning_rate": 9.598963779489693e-06, + "loss": 0.2359, + "num_tokens": 2519352132.0, + "step": 3304 + }, + { + "epoch": 4.5093616975878295, + "grad_norm": 0.19553770895643183, + "learning_rate": 9.594053718876817e-06, + "loss": 0.2482, + "num_tokens": 2520187666.0, + "step": 3305 + }, + { + "epoch": 4.51072727854651, + "grad_norm": 0.21334814037540908, + "learning_rate": 9.589144087157268e-06, + "loss": 0.2401, + "num_tokens": 2520939976.0, + "step": 3306 + }, + { + "epoch": 4.512092859505191, + "grad_norm": 0.21407323205075415, + "learning_rate": 9.584234885828759e-06, + "loss": 0.247, + "num_tokens": 2521724011.0, + "step": 3307 + }, + { + "epoch": 4.513458440463871, + "grad_norm": 0.21311862449104063, + "learning_rate": 9.579326116388874e-06, + "loss": 0.2265, + "num_tokens": 2522455080.0, + "step": 3308 + }, + { + "epoch": 4.514824021422552, + "grad_norm": 0.21249574053708586, + "learning_rate": 9.574417780335065e-06, + "loss": 0.2394, + "num_tokens": 2523213036.0, + "step": 3309 + }, + { + "epoch": 4.516189602381232, + "grad_norm": 0.1902922734441824, + "learning_rate": 9.569509879164658e-06, + "loss": 0.2386, + "num_tokens": 2524100233.0, + "step": 3310 + }, + { + "epoch": 4.517555183339912, + "grad_norm": 0.19500596371853096, + "learning_rate": 9.564602414374837e-06, + "loss": 0.2335, + "num_tokens": 2524830608.0, + "step": 3311 + }, + { + "epoch": 4.518920764298593, + "grad_norm": 0.23002574066241488, + "learning_rate": 9.559695387462657e-06, + "loss": 0.2398, + "num_tokens": 2525507673.0, + "step": 3312 + }, + { + "epoch": 4.520286345257273, + "grad_norm": 0.22684171617605037, + "learning_rate": 9.554788799925041e-06, + "loss": 0.2368, + "num_tokens": 2526187291.0, + "step": 3313 + }, + { + "epoch": 4.521651926215954, + "grad_norm": 0.20238387073527853, + "learning_rate": 9.54988265325877e-06, + "loss": 0.239, + "num_tokens": 2526971354.0, + "step": 3314 + }, + { + "epoch": 4.523017507174634, + "grad_norm": 0.21117097467299686, + "learning_rate": 9.54497694896051e-06, + "loss": 0.24, + "num_tokens": 2527729679.0, + "step": 3315 + }, + { + "epoch": 4.524383088133315, + "grad_norm": 0.21061796886520545, + "learning_rate": 9.540071688526765e-06, + "loss": 0.2355, + "num_tokens": 2528525732.0, + "step": 3316 + }, + { + "epoch": 4.525748669091995, + "grad_norm": 0.1998231650445214, + "learning_rate": 9.535166873453928e-06, + "loss": 0.2441, + "num_tokens": 2529330692.0, + "step": 3317 + }, + { + "epoch": 4.527114250050676, + "grad_norm": 0.20821820560054866, + "learning_rate": 9.53026250523824e-06, + "loss": 0.2327, + "num_tokens": 2530092824.0, + "step": 3318 + }, + { + "epoch": 4.528479831009356, + "grad_norm": 0.1974608459233468, + "learning_rate": 9.52535858537581e-06, + "loss": 0.2407, + "num_tokens": 2530860365.0, + "step": 3319 + }, + { + "epoch": 4.5298454119680365, + "grad_norm": 0.21264817780063427, + "learning_rate": 9.52045511536262e-06, + "loss": 0.2396, + "num_tokens": 2531559643.0, + "step": 3320 + }, + { + "epoch": 4.5312109929267175, + "grad_norm": 0.2032095804761357, + "learning_rate": 9.515552096694496e-06, + "loss": 0.2438, + "num_tokens": 2532359462.0, + "step": 3321 + }, + { + "epoch": 4.532576573885398, + "grad_norm": 0.206847131791177, + "learning_rate": 9.510649530867141e-06, + "loss": 0.2449, + "num_tokens": 2533120591.0, + "step": 3322 + }, + { + "epoch": 4.533942154844079, + "grad_norm": 0.1961256072640121, + "learning_rate": 9.505747419376117e-06, + "loss": 0.2397, + "num_tokens": 2533901847.0, + "step": 3323 + }, + { + "epoch": 4.535307735802759, + "grad_norm": 0.21022383561983476, + "learning_rate": 9.500845763716846e-06, + "loss": 0.2447, + "num_tokens": 2534727619.0, + "step": 3324 + }, + { + "epoch": 4.53667331676144, + "grad_norm": 0.18386993217022993, + "learning_rate": 9.495944565384608e-06, + "loss": 0.2345, + "num_tokens": 2535548012.0, + "step": 3325 + }, + { + "epoch": 4.53803889772012, + "grad_norm": 0.20480738204000826, + "learning_rate": 9.49104382587455e-06, + "loss": 0.2422, + "num_tokens": 2536296194.0, + "step": 3326 + }, + { + "epoch": 4.539404478678801, + "grad_norm": 0.2122268393634356, + "learning_rate": 9.486143546681673e-06, + "loss": 0.2332, + "num_tokens": 2536967574.0, + "step": 3327 + }, + { + "epoch": 4.540770059637481, + "grad_norm": 0.20121894239512383, + "learning_rate": 9.48124372930084e-06, + "loss": 0.2272, + "num_tokens": 2537750449.0, + "step": 3328 + }, + { + "epoch": 4.542135640596161, + "grad_norm": 0.23667169647472877, + "learning_rate": 9.476344375226774e-06, + "loss": 0.2477, + "num_tokens": 2538525532.0, + "step": 3329 + }, + { + "epoch": 4.543501221554842, + "grad_norm": 0.21366061234562636, + "learning_rate": 9.471445485954052e-06, + "loss": 0.2325, + "num_tokens": 2539307832.0, + "step": 3330 + }, + { + "epoch": 4.544866802513522, + "grad_norm": 0.19815201883866812, + "learning_rate": 9.46654706297712e-06, + "loss": 0.236, + "num_tokens": 2540070224.0, + "step": 3331 + }, + { + "epoch": 4.546232383472203, + "grad_norm": 0.20740861749296388, + "learning_rate": 9.461649107790267e-06, + "loss": 0.2455, + "num_tokens": 2540806333.0, + "step": 3332 + }, + { + "epoch": 4.547597964430883, + "grad_norm": 0.2170177498309866, + "learning_rate": 9.456751621887655e-06, + "loss": 0.2458, + "num_tokens": 2541550248.0, + "step": 3333 + }, + { + "epoch": 4.548963545389564, + "grad_norm": 0.19872950799517738, + "learning_rate": 9.45185460676329e-06, + "loss": 0.232, + "num_tokens": 2542252341.0, + "step": 3334 + }, + { + "epoch": 4.550329126348244, + "grad_norm": 0.20275107493927083, + "learning_rate": 9.446958063911038e-06, + "loss": 0.247, + "num_tokens": 2543048499.0, + "step": 3335 + }, + { + "epoch": 4.551694707306925, + "grad_norm": 0.1946748898295872, + "learning_rate": 9.442061994824625e-06, + "loss": 0.2414, + "num_tokens": 2543837208.0, + "step": 3336 + }, + { + "epoch": 4.5530602882656055, + "grad_norm": 0.21829189812186905, + "learning_rate": 9.437166400997629e-06, + "loss": 0.2373, + "num_tokens": 2544468680.0, + "step": 3337 + }, + { + "epoch": 4.554425869224286, + "grad_norm": 0.20345717987795855, + "learning_rate": 9.432271283923484e-06, + "loss": 0.2475, + "num_tokens": 2545261263.0, + "step": 3338 + }, + { + "epoch": 4.555791450182967, + "grad_norm": 0.2092678135699612, + "learning_rate": 9.427376645095475e-06, + "loss": 0.2497, + "num_tokens": 2545984283.0, + "step": 3339 + }, + { + "epoch": 4.557157031141647, + "grad_norm": 0.20776645458032952, + "learning_rate": 9.422482486006752e-06, + "loss": 0.2341, + "num_tokens": 2546719893.0, + "step": 3340 + }, + { + "epoch": 4.558522612100328, + "grad_norm": 0.19737905986640622, + "learning_rate": 9.4175888081503e-06, + "loss": 0.247, + "num_tokens": 2547469000.0, + "step": 3341 + }, + { + "epoch": 4.559888193059008, + "grad_norm": 0.20483525274851663, + "learning_rate": 9.41269561301898e-06, + "loss": 0.2399, + "num_tokens": 2548285537.0, + "step": 3342 + }, + { + "epoch": 4.561253774017689, + "grad_norm": 0.20461011556759764, + "learning_rate": 9.407802902105485e-06, + "loss": 0.2435, + "num_tokens": 2549126393.0, + "step": 3343 + }, + { + "epoch": 4.562619354976369, + "grad_norm": 0.19842686371041507, + "learning_rate": 9.402910676902373e-06, + "loss": 0.239, + "num_tokens": 2549971245.0, + "step": 3344 + }, + { + "epoch": 4.56398493593505, + "grad_norm": 0.20765518380211656, + "learning_rate": 9.398018938902049e-06, + "loss": 0.2342, + "num_tokens": 2550709560.0, + "step": 3345 + }, + { + "epoch": 4.56535051689373, + "grad_norm": 0.19830613044206608, + "learning_rate": 9.39312768959677e-06, + "loss": 0.244, + "num_tokens": 2551515851.0, + "step": 3346 + }, + { + "epoch": 4.56671609785241, + "grad_norm": 0.21071818175577023, + "learning_rate": 9.38823693047865e-06, + "loss": 0.2449, + "num_tokens": 2552266174.0, + "step": 3347 + }, + { + "epoch": 4.568081678811091, + "grad_norm": 0.22239948383968525, + "learning_rate": 9.383346663039637e-06, + "loss": 0.2406, + "num_tokens": 2552988256.0, + "step": 3348 + }, + { + "epoch": 4.569447259769771, + "grad_norm": 0.20886555791366224, + "learning_rate": 9.378456888771549e-06, + "loss": 0.2559, + "num_tokens": 2553789787.0, + "step": 3349 + }, + { + "epoch": 4.570812840728452, + "grad_norm": 0.19404738148780581, + "learning_rate": 9.373567609166044e-06, + "loss": 0.2347, + "num_tokens": 2554535598.0, + "step": 3350 + }, + { + "epoch": 4.572178421687132, + "grad_norm": 0.20752138244949273, + "learning_rate": 9.36867882571462e-06, + "loss": 0.2325, + "num_tokens": 2555277493.0, + "step": 3351 + }, + { + "epoch": 4.573544002645813, + "grad_norm": 0.19723591648963062, + "learning_rate": 9.363790539908644e-06, + "loss": 0.2482, + "num_tokens": 2556037836.0, + "step": 3352 + }, + { + "epoch": 4.5749095836044935, + "grad_norm": 0.20559341530574757, + "learning_rate": 9.358902753239312e-06, + "loss": 0.2372, + "num_tokens": 2556854768.0, + "step": 3353 + }, + { + "epoch": 4.5762751645631745, + "grad_norm": 0.19552347690470454, + "learning_rate": 9.354015467197687e-06, + "loss": 0.2329, + "num_tokens": 2557586391.0, + "step": 3354 + }, + { + "epoch": 4.577640745521855, + "grad_norm": 0.19827524158777876, + "learning_rate": 9.349128683274655e-06, + "loss": 0.2297, + "num_tokens": 2558331440.0, + "step": 3355 + }, + { + "epoch": 4.579006326480535, + "grad_norm": 0.1966743404356633, + "learning_rate": 9.344242402960972e-06, + "loss": 0.2441, + "num_tokens": 2559170428.0, + "step": 3356 + }, + { + "epoch": 4.580371907439216, + "grad_norm": 0.25065887768208667, + "learning_rate": 9.339356627747227e-06, + "loss": 0.2437, + "num_tokens": 2559888014.0, + "step": 3357 + }, + { + "epoch": 4.581737488397896, + "grad_norm": 0.20265510369293868, + "learning_rate": 9.334471359123856e-06, + "loss": 0.2281, + "num_tokens": 2560626387.0, + "step": 3358 + }, + { + "epoch": 4.583103069356577, + "grad_norm": 0.2017837716915504, + "learning_rate": 9.329586598581148e-06, + "loss": 0.2373, + "num_tokens": 2561300602.0, + "step": 3359 + }, + { + "epoch": 4.584468650315257, + "grad_norm": 0.20504440560350906, + "learning_rate": 9.324702347609228e-06, + "loss": 0.2398, + "num_tokens": 2562128829.0, + "step": 3360 + }, + { + "epoch": 4.585834231273938, + "grad_norm": 0.22478214511434785, + "learning_rate": 9.31981860769807e-06, + "loss": 0.2383, + "num_tokens": 2562827276.0, + "step": 3361 + }, + { + "epoch": 4.587199812232618, + "grad_norm": 0.21292453884830434, + "learning_rate": 9.314935380337494e-06, + "loss": 0.2453, + "num_tokens": 2563572334.0, + "step": 3362 + }, + { + "epoch": 4.588565393191299, + "grad_norm": 0.21732526611448022, + "learning_rate": 9.31005266701716e-06, + "loss": 0.2479, + "num_tokens": 2564385427.0, + "step": 3363 + }, + { + "epoch": 4.589930974149979, + "grad_norm": 0.20593138924171606, + "learning_rate": 9.305170469226568e-06, + "loss": 0.2426, + "num_tokens": 2565140180.0, + "step": 3364 + }, + { + "epoch": 4.591296555108659, + "grad_norm": 0.20879369659856262, + "learning_rate": 9.300288788455074e-06, + "loss": 0.2427, + "num_tokens": 2565894902.0, + "step": 3365 + }, + { + "epoch": 4.59266213606734, + "grad_norm": 0.21735351990921906, + "learning_rate": 9.295407626191861e-06, + "loss": 0.2441, + "num_tokens": 2566706943.0, + "step": 3366 + }, + { + "epoch": 4.59402771702602, + "grad_norm": 0.20813275971264644, + "learning_rate": 9.290526983925958e-06, + "loss": 0.2353, + "num_tokens": 2567496426.0, + "step": 3367 + }, + { + "epoch": 4.595393297984701, + "grad_norm": 0.2007255065621032, + "learning_rate": 9.285646863146247e-06, + "loss": 0.2406, + "num_tokens": 2568301001.0, + "step": 3368 + }, + { + "epoch": 4.5967588789433815, + "grad_norm": 0.20745305367779662, + "learning_rate": 9.28076726534143e-06, + "loss": 0.2436, + "num_tokens": 2569107954.0, + "step": 3369 + }, + { + "epoch": 4.5981244599020625, + "grad_norm": 0.2111653509822394, + "learning_rate": 9.275888192000071e-06, + "loss": 0.2377, + "num_tokens": 2569814167.0, + "step": 3370 + }, + { + "epoch": 4.599490040860743, + "grad_norm": 0.20382025044139543, + "learning_rate": 9.271009644610558e-06, + "loss": 0.2421, + "num_tokens": 2570570103.0, + "step": 3371 + }, + { + "epoch": 4.600855621819424, + "grad_norm": 0.2219712046819433, + "learning_rate": 9.266131624661127e-06, + "loss": 0.2499, + "num_tokens": 2571375071.0, + "step": 3372 + }, + { + "epoch": 4.602221202778104, + "grad_norm": 0.21584447497420187, + "learning_rate": 9.26125413363985e-06, + "loss": 0.2397, + "num_tokens": 2572210349.0, + "step": 3373 + }, + { + "epoch": 4.603586783736784, + "grad_norm": 0.19136172650251956, + "learning_rate": 9.256377173034637e-06, + "loss": 0.2394, + "num_tokens": 2573010038.0, + "step": 3374 + }, + { + "epoch": 4.604952364695465, + "grad_norm": 0.1980384169948712, + "learning_rate": 9.25150074433324e-06, + "loss": 0.2332, + "num_tokens": 2573771652.0, + "step": 3375 + }, + { + "epoch": 4.606317945654145, + "grad_norm": 0.21258289272872805, + "learning_rate": 9.246624849023242e-06, + "loss": 0.2437, + "num_tokens": 2574514024.0, + "step": 3376 + }, + { + "epoch": 4.607683526612826, + "grad_norm": 0.18987545444026027, + "learning_rate": 9.241749488592076e-06, + "loss": 0.2365, + "num_tokens": 2575260299.0, + "step": 3377 + }, + { + "epoch": 4.609049107571506, + "grad_norm": 0.20475513093298378, + "learning_rate": 9.236874664526988e-06, + "loss": 0.2414, + "num_tokens": 2576056104.0, + "step": 3378 + }, + { + "epoch": 4.610414688530187, + "grad_norm": 0.2186401168713881, + "learning_rate": 9.232000378315093e-06, + "loss": 0.2454, + "num_tokens": 2576795730.0, + "step": 3379 + }, + { + "epoch": 4.611780269488867, + "grad_norm": 0.2038955986182268, + "learning_rate": 9.227126631443316e-06, + "loss": 0.234, + "num_tokens": 2577525944.0, + "step": 3380 + }, + { + "epoch": 4.613145850447548, + "grad_norm": 0.21052793516269408, + "learning_rate": 9.222253425398426e-06, + "loss": 0.2311, + "num_tokens": 2578180828.0, + "step": 3381 + }, + { + "epoch": 4.614511431406228, + "grad_norm": 0.1918027864725675, + "learning_rate": 9.217380761667032e-06, + "loss": 0.216, + "num_tokens": 2578856727.0, + "step": 3382 + }, + { + "epoch": 4.615877012364908, + "grad_norm": 0.20523875165445205, + "learning_rate": 9.212508641735565e-06, + "loss": 0.2393, + "num_tokens": 2579600204.0, + "step": 3383 + }, + { + "epoch": 4.617242593323589, + "grad_norm": 0.1928714075772, + "learning_rate": 9.207637067090303e-06, + "loss": 0.2357, + "num_tokens": 2580358411.0, + "step": 3384 + }, + { + "epoch": 4.6186081742822696, + "grad_norm": 0.19573140085629262, + "learning_rate": 9.202766039217347e-06, + "loss": 0.2344, + "num_tokens": 2581186137.0, + "step": 3385 + }, + { + "epoch": 4.6199737552409506, + "grad_norm": 0.18818011183286545, + "learning_rate": 9.197895559602647e-06, + "loss": 0.2432, + "num_tokens": 2581968330.0, + "step": 3386 + }, + { + "epoch": 4.621339336199631, + "grad_norm": 0.20643431473450977, + "learning_rate": 9.193025629731964e-06, + "loss": 0.2409, + "num_tokens": 2582745545.0, + "step": 3387 + }, + { + "epoch": 4.622704917158312, + "grad_norm": 0.2101710158517749, + "learning_rate": 9.188156251090912e-06, + "loss": 0.2397, + "num_tokens": 2583482951.0, + "step": 3388 + }, + { + "epoch": 4.624070498116992, + "grad_norm": 0.19841672810635316, + "learning_rate": 9.183287425164923e-06, + "loss": 0.2361, + "num_tokens": 2584221626.0, + "step": 3389 + }, + { + "epoch": 4.625436079075673, + "grad_norm": 0.22442360568315275, + "learning_rate": 9.178419153439265e-06, + "loss": 0.2412, + "num_tokens": 2585020015.0, + "step": 3390 + }, + { + "epoch": 4.626801660034353, + "grad_norm": 0.19284497189387276, + "learning_rate": 9.173551437399038e-06, + "loss": 0.2417, + "num_tokens": 2585814417.0, + "step": 3391 + }, + { + "epoch": 4.628167240993033, + "grad_norm": 0.2043403700587299, + "learning_rate": 9.16868427852917e-06, + "loss": 0.2396, + "num_tokens": 2586625349.0, + "step": 3392 + }, + { + "epoch": 4.629532821951714, + "grad_norm": 0.2702723684086294, + "learning_rate": 9.16381767831442e-06, + "loss": 0.2465, + "num_tokens": 2587410176.0, + "step": 3393 + }, + { + "epoch": 4.630898402910394, + "grad_norm": 0.2008214337292333, + "learning_rate": 9.158951638239377e-06, + "loss": 0.2333, + "num_tokens": 2588142873.0, + "step": 3394 + }, + { + "epoch": 4.632263983869075, + "grad_norm": 0.2130384556909265, + "learning_rate": 9.154086159788462e-06, + "loss": 0.2388, + "num_tokens": 2588841395.0, + "step": 3395 + }, + { + "epoch": 4.633629564827755, + "grad_norm": 0.1927846463939635, + "learning_rate": 9.14922124444592e-06, + "loss": 0.2387, + "num_tokens": 2589591040.0, + "step": 3396 + }, + { + "epoch": 4.634995145786436, + "grad_norm": 0.1863155373662412, + "learning_rate": 9.144356893695828e-06, + "loss": 0.2278, + "num_tokens": 2590353775.0, + "step": 3397 + }, + { + "epoch": 4.636360726745116, + "grad_norm": 0.22174885153519772, + "learning_rate": 9.139493109022084e-06, + "loss": 0.2399, + "num_tokens": 2591175150.0, + "step": 3398 + }, + { + "epoch": 4.637726307703797, + "grad_norm": 0.19642442490557024, + "learning_rate": 9.134629891908419e-06, + "loss": 0.2369, + "num_tokens": 2591933313.0, + "step": 3399 + }, + { + "epoch": 4.6390918886624775, + "grad_norm": 0.19415599009522339, + "learning_rate": 9.129767243838397e-06, + "loss": 0.2467, + "num_tokens": 2592706622.0, + "step": 3400 + }, + { + "epoch": 4.640457469621158, + "grad_norm": 0.2097545898528867, + "learning_rate": 9.12490516629539e-06, + "loss": 0.2501, + "num_tokens": 2593506766.0, + "step": 3401 + }, + { + "epoch": 4.641823050579839, + "grad_norm": 0.22033954052570268, + "learning_rate": 9.120043660762618e-06, + "loss": 0.2389, + "num_tokens": 2594192449.0, + "step": 3402 + }, + { + "epoch": 4.643188631538519, + "grad_norm": 0.21173903316684015, + "learning_rate": 9.115182728723107e-06, + "loss": 0.2569, + "num_tokens": 2595070814.0, + "step": 3403 + }, + { + "epoch": 4.6445542124972, + "grad_norm": 0.1992122425682799, + "learning_rate": 9.110322371659724e-06, + "loss": 0.2402, + "num_tokens": 2595865510.0, + "step": 3404 + }, + { + "epoch": 4.64591979345588, + "grad_norm": 0.2183899045887962, + "learning_rate": 9.105462591055153e-06, + "loss": 0.2351, + "num_tokens": 2596615035.0, + "step": 3405 + }, + { + "epoch": 4.647285374414561, + "grad_norm": 0.20091140248998032, + "learning_rate": 9.100603388391893e-06, + "loss": 0.2307, + "num_tokens": 2597330167.0, + "step": 3406 + }, + { + "epoch": 4.648650955373241, + "grad_norm": 0.21109964597808042, + "learning_rate": 9.095744765152289e-06, + "loss": 0.2405, + "num_tokens": 2598059193.0, + "step": 3407 + }, + { + "epoch": 4.650016536331922, + "grad_norm": 0.21463726523173413, + "learning_rate": 9.090886722818487e-06, + "loss": 0.2502, + "num_tokens": 2598818097.0, + "step": 3408 + }, + { + "epoch": 4.651382117290602, + "grad_norm": 0.19615362984320264, + "learning_rate": 9.086029262872474e-06, + "loss": 0.2415, + "num_tokens": 2599628169.0, + "step": 3409 + }, + { + "epoch": 4.652747698249282, + "grad_norm": 0.2087793222332414, + "learning_rate": 9.081172386796042e-06, + "loss": 0.2322, + "num_tokens": 2600280794.0, + "step": 3410 + }, + { + "epoch": 4.654113279207963, + "grad_norm": 0.20867051152430605, + "learning_rate": 9.07631609607082e-06, + "loss": 0.2393, + "num_tokens": 2601052424.0, + "step": 3411 + }, + { + "epoch": 4.655478860166643, + "grad_norm": 0.19221732381700896, + "learning_rate": 9.071460392178249e-06, + "loss": 0.2307, + "num_tokens": 2601859565.0, + "step": 3412 + }, + { + "epoch": 4.656844441125324, + "grad_norm": 0.20093532971234912, + "learning_rate": 9.066605276599587e-06, + "loss": 0.2345, + "num_tokens": 2602585349.0, + "step": 3413 + }, + { + "epoch": 4.658210022084004, + "grad_norm": 0.21414510786184207, + "learning_rate": 9.061750750815937e-06, + "loss": 0.2316, + "num_tokens": 2603312276.0, + "step": 3414 + }, + { + "epoch": 4.659575603042685, + "grad_norm": 0.22765466836249135, + "learning_rate": 9.05689681630819e-06, + "loss": 0.2471, + "num_tokens": 2604069856.0, + "step": 3415 + }, + { + "epoch": 4.6609411840013655, + "grad_norm": 0.20959830526947132, + "learning_rate": 9.052043474557075e-06, + "loss": 0.2475, + "num_tokens": 2604817797.0, + "step": 3416 + }, + { + "epoch": 4.6623067649600465, + "grad_norm": 0.19917499229406238, + "learning_rate": 9.047190727043139e-06, + "loss": 0.2397, + "num_tokens": 2605619999.0, + "step": 3417 + }, + { + "epoch": 4.663672345918727, + "grad_norm": 0.20046073976503984, + "learning_rate": 9.042338575246743e-06, + "loss": 0.2325, + "num_tokens": 2606353413.0, + "step": 3418 + }, + { + "epoch": 4.665037926877407, + "grad_norm": 0.19794054127933924, + "learning_rate": 9.037487020648069e-06, + "loss": 0.2433, + "num_tokens": 2607151533.0, + "step": 3419 + }, + { + "epoch": 4.666403507836088, + "grad_norm": 0.1915607384046705, + "learning_rate": 9.032636064727114e-06, + "loss": 0.2365, + "num_tokens": 2607906933.0, + "step": 3420 + }, + { + "epoch": 4.667769088794768, + "grad_norm": 0.20413457413681754, + "learning_rate": 9.0277857089637e-06, + "loss": 0.2381, + "num_tokens": 2608675546.0, + "step": 3421 + }, + { + "epoch": 4.669134669753449, + "grad_norm": 0.1863915118290176, + "learning_rate": 9.02293595483745e-06, + "loss": 0.2392, + "num_tokens": 2609520908.0, + "step": 3422 + }, + { + "epoch": 4.670500250712129, + "grad_norm": 0.2032791408809255, + "learning_rate": 9.018086803827825e-06, + "loss": 0.2361, + "num_tokens": 2610236046.0, + "step": 3423 + }, + { + "epoch": 4.67186583167081, + "grad_norm": 0.20673513227509388, + "learning_rate": 9.013238257414088e-06, + "loss": 0.244, + "num_tokens": 2610994910.0, + "step": 3424 + }, + { + "epoch": 4.67323141262949, + "grad_norm": 0.19885318088084145, + "learning_rate": 9.008390317075318e-06, + "loss": 0.2289, + "num_tokens": 2611725688.0, + "step": 3425 + }, + { + "epoch": 4.674596993588171, + "grad_norm": 0.1918076829127812, + "learning_rate": 9.003542984290412e-06, + "loss": 0.2447, + "num_tokens": 2612546821.0, + "step": 3426 + }, + { + "epoch": 4.675962574546851, + "grad_norm": 0.19894733350992833, + "learning_rate": 8.998696260538085e-06, + "loss": 0.2467, + "num_tokens": 2613366501.0, + "step": 3427 + }, + { + "epoch": 4.677328155505531, + "grad_norm": 0.19085159155854678, + "learning_rate": 8.993850147296858e-06, + "loss": 0.2305, + "num_tokens": 2614117450.0, + "step": 3428 + }, + { + "epoch": 4.678693736464212, + "grad_norm": 0.2064451831799567, + "learning_rate": 8.989004646045072e-06, + "loss": 0.2466, + "num_tokens": 2614974965.0, + "step": 3429 + }, + { + "epoch": 4.680059317422892, + "grad_norm": 0.19476204598330513, + "learning_rate": 8.984159758260881e-06, + "loss": 0.2483, + "num_tokens": 2615787674.0, + "step": 3430 + }, + { + "epoch": 4.681424898381573, + "grad_norm": 0.20123243475213579, + "learning_rate": 8.979315485422246e-06, + "loss": 0.2329, + "num_tokens": 2616555299.0, + "step": 3431 + }, + { + "epoch": 4.6827904793402535, + "grad_norm": 0.19874328660448434, + "learning_rate": 8.97447182900695e-06, + "loss": 0.244, + "num_tokens": 2617320667.0, + "step": 3432 + }, + { + "epoch": 4.6841560602989345, + "grad_norm": 0.20427365755115598, + "learning_rate": 8.969628790492581e-06, + "loss": 0.2431, + "num_tokens": 2618049419.0, + "step": 3433 + }, + { + "epoch": 4.685521641257615, + "grad_norm": 0.1941240644124144, + "learning_rate": 8.964786371356541e-06, + "loss": 0.2447, + "num_tokens": 2618887091.0, + "step": 3434 + }, + { + "epoch": 4.686887222216296, + "grad_norm": 0.22380448327526603, + "learning_rate": 8.95994457307604e-06, + "loss": 0.2354, + "num_tokens": 2619647365.0, + "step": 3435 + }, + { + "epoch": 4.688252803174976, + "grad_norm": 0.1937653605399362, + "learning_rate": 8.955103397128104e-06, + "loss": 0.2329, + "num_tokens": 2620471031.0, + "step": 3436 + }, + { + "epoch": 4.689618384133656, + "grad_norm": 0.20201791530116348, + "learning_rate": 8.950262844989566e-06, + "loss": 0.2407, + "num_tokens": 2621228066.0, + "step": 3437 + }, + { + "epoch": 4.690983965092337, + "grad_norm": 0.19820667785130155, + "learning_rate": 8.945422918137063e-06, + "loss": 0.2455, + "num_tokens": 2622012784.0, + "step": 3438 + }, + { + "epoch": 4.692349546051017, + "grad_norm": 0.21931220453542963, + "learning_rate": 8.940583618047054e-06, + "loss": 0.2479, + "num_tokens": 2622753444.0, + "step": 3439 + }, + { + "epoch": 4.693715127009698, + "grad_norm": 0.21600074264636046, + "learning_rate": 8.935744946195794e-06, + "loss": 0.2474, + "num_tokens": 2623531027.0, + "step": 3440 + }, + { + "epoch": 4.695080707968378, + "grad_norm": 0.20162729571272842, + "learning_rate": 8.93090690405936e-06, + "loss": 0.234, + "num_tokens": 2624247871.0, + "step": 3441 + }, + { + "epoch": 4.696446288927059, + "grad_norm": 0.2155168583520979, + "learning_rate": 8.926069493113623e-06, + "loss": 0.2414, + "num_tokens": 2624938296.0, + "step": 3442 + }, + { + "epoch": 4.697811869885739, + "grad_norm": 0.1978173184758825, + "learning_rate": 8.92123271483427e-06, + "loss": 0.238, + "num_tokens": 2625731718.0, + "step": 3443 + }, + { + "epoch": 4.69917745084442, + "grad_norm": 0.21737327421965463, + "learning_rate": 8.916396570696792e-06, + "loss": 0.2487, + "num_tokens": 2626510612.0, + "step": 3444 + }, + { + "epoch": 4.7005430318031, + "grad_norm": 0.2251280320721177, + "learning_rate": 8.911561062176489e-06, + "loss": 0.2384, + "num_tokens": 2627311777.0, + "step": 3445 + }, + { + "epoch": 4.70190861276178, + "grad_norm": 0.20631562841274861, + "learning_rate": 8.90672619074846e-06, + "loss": 0.2361, + "num_tokens": 2628058491.0, + "step": 3446 + }, + { + "epoch": 4.703274193720461, + "grad_norm": 0.218136447843788, + "learning_rate": 8.90189195788762e-06, + "loss": 0.2384, + "num_tokens": 2628888563.0, + "step": 3447 + }, + { + "epoch": 4.7046397746791415, + "grad_norm": 0.2176447572319751, + "learning_rate": 8.897058365068681e-06, + "loss": 0.241, + "num_tokens": 2629637704.0, + "step": 3448 + }, + { + "epoch": 4.7060053556378225, + "grad_norm": 0.20738997901789635, + "learning_rate": 8.892225413766162e-06, + "loss": 0.2368, + "num_tokens": 2630385190.0, + "step": 3449 + }, + { + "epoch": 4.707370936596503, + "grad_norm": 0.20091440852386375, + "learning_rate": 8.887393105454392e-06, + "loss": 0.2512, + "num_tokens": 2631152963.0, + "step": 3450 + }, + { + "epoch": 4.708736517555184, + "grad_norm": 0.2098186171409711, + "learning_rate": 8.882561441607498e-06, + "loss": 0.2344, + "num_tokens": 2631867937.0, + "step": 3451 + }, + { + "epoch": 4.710102098513864, + "grad_norm": 0.21236019639692066, + "learning_rate": 8.877730423699407e-06, + "loss": 0.2347, + "num_tokens": 2632602049.0, + "step": 3452 + }, + { + "epoch": 4.711467679472545, + "grad_norm": 0.19826985284042017, + "learning_rate": 8.872900053203858e-06, + "loss": 0.2433, + "num_tokens": 2633337686.0, + "step": 3453 + }, + { + "epoch": 4.712833260431225, + "grad_norm": 0.20938059333507097, + "learning_rate": 8.868070331594385e-06, + "loss": 0.2461, + "num_tokens": 2634062899.0, + "step": 3454 + }, + { + "epoch": 4.714198841389905, + "grad_norm": 0.2079664207094596, + "learning_rate": 8.863241260344328e-06, + "loss": 0.249, + "num_tokens": 2634824158.0, + "step": 3455 + }, + { + "epoch": 4.715564422348586, + "grad_norm": 0.19787334732117126, + "learning_rate": 8.858412840926825e-06, + "loss": 0.2444, + "num_tokens": 2635593791.0, + "step": 3456 + }, + { + "epoch": 4.716930003307266, + "grad_norm": 0.21821675059146553, + "learning_rate": 8.853585074814825e-06, + "loss": 0.2393, + "num_tokens": 2636338475.0, + "step": 3457 + }, + { + "epoch": 4.718295584265947, + "grad_norm": 0.20631237813921496, + "learning_rate": 8.848757963481058e-06, + "loss": 0.2386, + "num_tokens": 2637073299.0, + "step": 3458 + }, + { + "epoch": 4.719661165224627, + "grad_norm": 0.21133152322323226, + "learning_rate": 8.84393150839808e-06, + "loss": 0.2451, + "num_tokens": 2637795017.0, + "step": 3459 + }, + { + "epoch": 4.721026746183308, + "grad_norm": 0.2068733099127569, + "learning_rate": 8.839105711038226e-06, + "loss": 0.2279, + "num_tokens": 2638492681.0, + "step": 3460 + }, + { + "epoch": 4.722392327141988, + "grad_norm": 0.1986085107551285, + "learning_rate": 8.834280572873639e-06, + "loss": 0.2362, + "num_tokens": 2639232869.0, + "step": 3461 + }, + { + "epoch": 4.723757908100669, + "grad_norm": 0.2132653353142382, + "learning_rate": 8.829456095376261e-06, + "loss": 0.2308, + "num_tokens": 2639977426.0, + "step": 3462 + }, + { + "epoch": 4.725123489059349, + "grad_norm": 0.2117474932924526, + "learning_rate": 8.824632280017828e-06, + "loss": 0.2335, + "num_tokens": 2640760372.0, + "step": 3463 + }, + { + "epoch": 4.7264890700180295, + "grad_norm": 0.19181322583481006, + "learning_rate": 8.81980912826988e-06, + "loss": 0.2455, + "num_tokens": 2641560339.0, + "step": 3464 + }, + { + "epoch": 4.7278546509767105, + "grad_norm": 0.1962853955445266, + "learning_rate": 8.81498664160375e-06, + "loss": 0.2383, + "num_tokens": 2642327895.0, + "step": 3465 + }, + { + "epoch": 4.729220231935391, + "grad_norm": 0.22798853304647806, + "learning_rate": 8.810164821490575e-06, + "loss": 0.2477, + "num_tokens": 2643067545.0, + "step": 3466 + }, + { + "epoch": 4.730585812894072, + "grad_norm": 0.19114849585864122, + "learning_rate": 8.805343669401276e-06, + "loss": 0.2356, + "num_tokens": 2643870442.0, + "step": 3467 + }, + { + "epoch": 4.731951393852752, + "grad_norm": 0.20823156144900565, + "learning_rate": 8.80052318680658e-06, + "loss": 0.2431, + "num_tokens": 2644616278.0, + "step": 3468 + }, + { + "epoch": 4.733316974811433, + "grad_norm": 0.20139828798289083, + "learning_rate": 8.795703375177009e-06, + "loss": 0.2388, + "num_tokens": 2645417567.0, + "step": 3469 + }, + { + "epoch": 4.734682555770113, + "grad_norm": 0.21371512111461002, + "learning_rate": 8.790884235982878e-06, + "loss": 0.2302, + "num_tokens": 2646177617.0, + "step": 3470 + }, + { + "epoch": 4.736048136728794, + "grad_norm": 0.20167306565140672, + "learning_rate": 8.7860657706943e-06, + "loss": 0.2425, + "num_tokens": 2646940155.0, + "step": 3471 + }, + { + "epoch": 4.737413717687474, + "grad_norm": 0.20455965938513435, + "learning_rate": 8.781247980781176e-06, + "loss": 0.245, + "num_tokens": 2647764375.0, + "step": 3472 + }, + { + "epoch": 4.738779298646154, + "grad_norm": 0.19280606796453045, + "learning_rate": 8.776430867713207e-06, + "loss": 0.2292, + "num_tokens": 2648516705.0, + "step": 3473 + }, + { + "epoch": 4.740144879604835, + "grad_norm": 0.20303943832322682, + "learning_rate": 8.771614432959886e-06, + "loss": 0.2388, + "num_tokens": 2649260302.0, + "step": 3474 + }, + { + "epoch": 4.741510460563515, + "grad_norm": 0.19880908136277226, + "learning_rate": 8.766798677990502e-06, + "loss": 0.2337, + "num_tokens": 2650048652.0, + "step": 3475 + }, + { + "epoch": 4.742876041522196, + "grad_norm": 0.20671883239911407, + "learning_rate": 8.761983604274126e-06, + "loss": 0.2425, + "num_tokens": 2650786394.0, + "step": 3476 + }, + { + "epoch": 4.744241622480876, + "grad_norm": 0.2043984311605638, + "learning_rate": 8.75716921327963e-06, + "loss": 0.2526, + "num_tokens": 2651584884.0, + "step": 3477 + }, + { + "epoch": 4.745607203439557, + "grad_norm": 0.20216514799825924, + "learning_rate": 8.752355506475683e-06, + "loss": 0.2381, + "num_tokens": 2652368070.0, + "step": 3478 + }, + { + "epoch": 4.746972784398237, + "grad_norm": 0.25437274362499285, + "learning_rate": 8.747542485330731e-06, + "loss": 0.2393, + "num_tokens": 2653197918.0, + "step": 3479 + }, + { + "epoch": 4.7483383653569184, + "grad_norm": 0.20064138211692928, + "learning_rate": 8.742730151313021e-06, + "loss": 0.243, + "num_tokens": 2654018814.0, + "step": 3480 + }, + { + "epoch": 4.749703946315599, + "grad_norm": 0.20461619489911198, + "learning_rate": 8.737918505890588e-06, + "loss": 0.2321, + "num_tokens": 2654760579.0, + "step": 3481 + }, + { + "epoch": 4.751069527274279, + "grad_norm": 0.21920725106126696, + "learning_rate": 8.733107550531256e-06, + "loss": 0.2397, + "num_tokens": 2655530938.0, + "step": 3482 + }, + { + "epoch": 4.75243510823296, + "grad_norm": 0.20517291122721565, + "learning_rate": 8.728297286702642e-06, + "loss": 0.2482, + "num_tokens": 2656333173.0, + "step": 3483 + }, + { + "epoch": 4.75380068919164, + "grad_norm": 0.1968468560053242, + "learning_rate": 8.723487715872143e-06, + "loss": 0.2437, + "num_tokens": 2657124467.0, + "step": 3484 + }, + { + "epoch": 4.755166270150321, + "grad_norm": 0.21128441083480096, + "learning_rate": 8.71867883950696e-06, + "loss": 0.2333, + "num_tokens": 2657828303.0, + "step": 3485 + }, + { + "epoch": 4.756531851109001, + "grad_norm": 0.20050348497069168, + "learning_rate": 8.713870659074062e-06, + "loss": 0.2404, + "num_tokens": 2658563237.0, + "step": 3486 + }, + { + "epoch": 4.757897432067682, + "grad_norm": 0.20120414035053463, + "learning_rate": 8.709063176040224e-06, + "loss": 0.2265, + "num_tokens": 2659327453.0, + "step": 3487 + }, + { + "epoch": 4.759263013026362, + "grad_norm": 0.20862976932418367, + "learning_rate": 8.704256391871999e-06, + "loss": 0.2286, + "num_tokens": 2660005865.0, + "step": 3488 + }, + { + "epoch": 4.760628593985043, + "grad_norm": 0.3231363466815293, + "learning_rate": 8.699450308035729e-06, + "loss": 0.2427, + "num_tokens": 2660789628.0, + "step": 3489 + }, + { + "epoch": 4.761994174943723, + "grad_norm": 0.21728893392592669, + "learning_rate": 8.694644925997541e-06, + "loss": 0.2344, + "num_tokens": 2661476657.0, + "step": 3490 + }, + { + "epoch": 4.763359755902403, + "grad_norm": 0.20313297060154203, + "learning_rate": 8.689840247223351e-06, + "loss": 0.2395, + "num_tokens": 2662264009.0, + "step": 3491 + }, + { + "epoch": 4.764725336861084, + "grad_norm": 0.21048672623855538, + "learning_rate": 8.685036273178857e-06, + "loss": 0.2488, + "num_tokens": 2663131683.0, + "step": 3492 + }, + { + "epoch": 4.766090917819764, + "grad_norm": 0.20000869715531253, + "learning_rate": 8.680233005329542e-06, + "loss": 0.2431, + "num_tokens": 2664015817.0, + "step": 3493 + }, + { + "epoch": 4.767456498778445, + "grad_norm": 0.20033951619164836, + "learning_rate": 8.675430445140678e-06, + "loss": 0.2545, + "num_tokens": 2664863719.0, + "step": 3494 + }, + { + "epoch": 4.7688220797371255, + "grad_norm": 0.9254328335471963, + "learning_rate": 8.670628594077313e-06, + "loss": 0.2428, + "num_tokens": 2665668907.0, + "step": 3495 + }, + { + "epoch": 4.7701876606958065, + "grad_norm": 0.22107240968437783, + "learning_rate": 8.665827453604292e-06, + "loss": 0.2409, + "num_tokens": 2666494912.0, + "step": 3496 + }, + { + "epoch": 4.771553241654487, + "grad_norm": 0.19736163664672604, + "learning_rate": 8.661027025186228e-06, + "loss": 0.2433, + "num_tokens": 2667311062.0, + "step": 3497 + }, + { + "epoch": 4.772918822613168, + "grad_norm": 0.20382838603919087, + "learning_rate": 8.656227310287527e-06, + "loss": 0.2298, + "num_tokens": 2667997722.0, + "step": 3498 + }, + { + "epoch": 4.774284403571848, + "grad_norm": 0.20056292305269374, + "learning_rate": 8.651428310372374e-06, + "loss": 0.2364, + "num_tokens": 2668764375.0, + "step": 3499 + }, + { + "epoch": 4.775649984530528, + "grad_norm": 0.21059994852897473, + "learning_rate": 8.646630026904736e-06, + "loss": 0.241, + "num_tokens": 2669486381.0, + "step": 3500 + }, + { + "epoch": 4.777015565489209, + "grad_norm": 0.21137167137458931, + "learning_rate": 8.64183246134836e-06, + "loss": 0.2445, + "num_tokens": 2670286373.0, + "step": 3501 + }, + { + "epoch": 4.778381146447889, + "grad_norm": 0.19737520379374746, + "learning_rate": 8.637035615166773e-06, + "loss": 0.2424, + "num_tokens": 2671045700.0, + "step": 3502 + }, + { + "epoch": 4.77974672740657, + "grad_norm": 0.22951256286280264, + "learning_rate": 8.632239489823295e-06, + "loss": 0.2344, + "num_tokens": 2671735477.0, + "step": 3503 + }, + { + "epoch": 4.78111230836525, + "grad_norm": 0.20014640779101314, + "learning_rate": 8.627444086781006e-06, + "loss": 0.24, + "num_tokens": 2672527227.0, + "step": 3504 + }, + { + "epoch": 4.782477889323931, + "grad_norm": 0.20229721560225042, + "learning_rate": 8.62264940750278e-06, + "loss": 0.249, + "num_tokens": 2673358011.0, + "step": 3505 + }, + { + "epoch": 4.783843470282611, + "grad_norm": 0.2007001466116834, + "learning_rate": 8.617855453451266e-06, + "loss": 0.2453, + "num_tokens": 2674107022.0, + "step": 3506 + }, + { + "epoch": 4.785209051241292, + "grad_norm": 0.21770157105788737, + "learning_rate": 8.61306222608889e-06, + "loss": 0.2435, + "num_tokens": 2674869998.0, + "step": 3507 + }, + { + "epoch": 4.786574632199972, + "grad_norm": 0.21458519033443413, + "learning_rate": 8.608269726877863e-06, + "loss": 0.2417, + "num_tokens": 2675624791.0, + "step": 3508 + }, + { + "epoch": 4.787940213158652, + "grad_norm": 0.19794176612638098, + "learning_rate": 8.603477957280162e-06, + "loss": 0.2407, + "num_tokens": 2676405504.0, + "step": 3509 + }, + { + "epoch": 4.789305794117333, + "grad_norm": 0.19773039485223795, + "learning_rate": 8.598686918757552e-06, + "loss": 0.241, + "num_tokens": 2677178275.0, + "step": 3510 + }, + { + "epoch": 4.7906713750760135, + "grad_norm": 0.2094566051703006, + "learning_rate": 8.593896612771569e-06, + "loss": 0.2407, + "num_tokens": 2677910825.0, + "step": 3511 + }, + { + "epoch": 4.7920369560346945, + "grad_norm": 0.1984124236256028, + "learning_rate": 8.589107040783533e-06, + "loss": 0.2364, + "num_tokens": 2678684964.0, + "step": 3512 + }, + { + "epoch": 4.793402536993375, + "grad_norm": 0.1944116653235451, + "learning_rate": 8.584318204254532e-06, + "loss": 0.2272, + "num_tokens": 2679461280.0, + "step": 3513 + }, + { + "epoch": 4.794768117952056, + "grad_norm": 0.20563776076336865, + "learning_rate": 8.57953010464543e-06, + "loss": 0.2375, + "num_tokens": 2680276281.0, + "step": 3514 + }, + { + "epoch": 4.796133698910736, + "grad_norm": 0.1924070099939996, + "learning_rate": 8.574742743416875e-06, + "loss": 0.2359, + "num_tokens": 2681129655.0, + "step": 3515 + }, + { + "epoch": 4.797499279869417, + "grad_norm": 0.20258910851669598, + "learning_rate": 8.569956122029278e-06, + "loss": 0.2459, + "num_tokens": 2681932261.0, + "step": 3516 + }, + { + "epoch": 4.798864860828097, + "grad_norm": 0.20474501298699052, + "learning_rate": 8.565170241942832e-06, + "loss": 0.2443, + "num_tokens": 2682698141.0, + "step": 3517 + }, + { + "epoch": 4.800230441786777, + "grad_norm": 0.20804501001678166, + "learning_rate": 8.560385104617501e-06, + "loss": 0.2381, + "num_tokens": 2683447516.0, + "step": 3518 + }, + { + "epoch": 4.801596022745458, + "grad_norm": 0.1896594868273113, + "learning_rate": 8.555600711513027e-06, + "loss": 0.2368, + "num_tokens": 2684190675.0, + "step": 3519 + }, + { + "epoch": 4.802961603704138, + "grad_norm": 0.1943357116919205, + "learning_rate": 8.550817064088913e-06, + "loss": 0.2436, + "num_tokens": 2684943531.0, + "step": 3520 + }, + { + "epoch": 4.804327184662819, + "grad_norm": 0.21083486838821808, + "learning_rate": 8.546034163804451e-06, + "loss": 0.2358, + "num_tokens": 2685659176.0, + "step": 3521 + }, + { + "epoch": 4.805692765621499, + "grad_norm": 0.19338824713139363, + "learning_rate": 8.541252012118695e-06, + "loss": 0.2366, + "num_tokens": 2686413442.0, + "step": 3522 + }, + { + "epoch": 4.80705834658018, + "grad_norm": 0.21829309274680628, + "learning_rate": 8.536470610490465e-06, + "loss": 0.2562, + "num_tokens": 2687150164.0, + "step": 3523 + }, + { + "epoch": 4.80842392753886, + "grad_norm": 0.20185850505335706, + "learning_rate": 8.531689960378371e-06, + "loss": 0.2419, + "num_tokens": 2687866521.0, + "step": 3524 + }, + { + "epoch": 4.809789508497541, + "grad_norm": 0.19894912460188235, + "learning_rate": 8.526910063240773e-06, + "loss": 0.2354, + "num_tokens": 2688701906.0, + "step": 3525 + }, + { + "epoch": 4.811155089456221, + "grad_norm": 0.20785615628428386, + "learning_rate": 8.522130920535817e-06, + "loss": 0.2429, + "num_tokens": 2689404506.0, + "step": 3526 + }, + { + "epoch": 4.8125206704149015, + "grad_norm": 0.19360111389745663, + "learning_rate": 8.517352533721407e-06, + "loss": 0.2335, + "num_tokens": 2690207394.0, + "step": 3527 + }, + { + "epoch": 4.8138862513735825, + "grad_norm": 0.20959764231373693, + "learning_rate": 8.512574904255228e-06, + "loss": 0.2416, + "num_tokens": 2690948274.0, + "step": 3528 + }, + { + "epoch": 4.815251832332263, + "grad_norm": 0.192612350058698, + "learning_rate": 8.507798033594722e-06, + "loss": 0.2379, + "num_tokens": 2691757063.0, + "step": 3529 + }, + { + "epoch": 4.816617413290944, + "grad_norm": 0.21385397308351053, + "learning_rate": 8.503021923197109e-06, + "loss": 0.2374, + "num_tokens": 2692546647.0, + "step": 3530 + }, + { + "epoch": 4.817982994249624, + "grad_norm": 0.1813102533553526, + "learning_rate": 8.498246574519373e-06, + "loss": 0.2281, + "num_tokens": 2693296491.0, + "step": 3531 + }, + { + "epoch": 4.819348575208305, + "grad_norm": 0.21140851803142244, + "learning_rate": 8.493471989018261e-06, + "loss": 0.2508, + "num_tokens": 2694076947.0, + "step": 3532 + }, + { + "epoch": 4.820714156166985, + "grad_norm": 0.2142010496657102, + "learning_rate": 8.488698168150302e-06, + "loss": 0.2539, + "num_tokens": 2694795910.0, + "step": 3533 + }, + { + "epoch": 4.822079737125666, + "grad_norm": 0.19629091281595965, + "learning_rate": 8.483925113371772e-06, + "loss": 0.2498, + "num_tokens": 2695595776.0, + "step": 3534 + }, + { + "epoch": 4.823445318084346, + "grad_norm": 0.1881607352649016, + "learning_rate": 8.479152826138732e-06, + "loss": 0.2381, + "num_tokens": 2696356047.0, + "step": 3535 + }, + { + "epoch": 4.824810899043026, + "grad_norm": 0.19693070624729248, + "learning_rate": 8.474381307906994e-06, + "loss": 0.2301, + "num_tokens": 2697091620.0, + "step": 3536 + }, + { + "epoch": 4.826176480001707, + "grad_norm": 0.8502824034458002, + "learning_rate": 8.469610560132144e-06, + "loss": 0.2314, + "num_tokens": 2697788249.0, + "step": 3537 + }, + { + "epoch": 4.827542060960387, + "grad_norm": 0.199977166336443, + "learning_rate": 8.464840584269534e-06, + "loss": 0.2432, + "num_tokens": 2698629778.0, + "step": 3538 + }, + { + "epoch": 4.828907641919068, + "grad_norm": 0.19464268395374384, + "learning_rate": 8.460071381774266e-06, + "loss": 0.2335, + "num_tokens": 2699357028.0, + "step": 3539 + }, + { + "epoch": 4.830273222877748, + "grad_norm": 0.20397808092604017, + "learning_rate": 8.45530295410123e-06, + "loss": 0.2529, + "num_tokens": 2700151783.0, + "step": 3540 + }, + { + "epoch": 4.831638803836429, + "grad_norm": 0.21083679893768728, + "learning_rate": 8.450535302705058e-06, + "loss": 0.2423, + "num_tokens": 2700880652.0, + "step": 3541 + }, + { + "epoch": 4.833004384795109, + "grad_norm": 0.20809018411875366, + "learning_rate": 8.44576842904016e-06, + "loss": 0.2434, + "num_tokens": 2701613737.0, + "step": 3542 + }, + { + "epoch": 4.83436996575379, + "grad_norm": 0.18860276815919091, + "learning_rate": 8.441002334560697e-06, + "loss": 0.2299, + "num_tokens": 2702398696.0, + "step": 3543 + }, + { + "epoch": 4.8357355467124705, + "grad_norm": 0.1953065425890423, + "learning_rate": 8.436237020720606e-06, + "loss": 0.2462, + "num_tokens": 2703244713.0, + "step": 3544 + }, + { + "epoch": 4.837101127671151, + "grad_norm": 0.20072072385286305, + "learning_rate": 8.43147248897357e-06, + "loss": 0.2272, + "num_tokens": 2703977497.0, + "step": 3545 + }, + { + "epoch": 4.838466708629832, + "grad_norm": 0.19756211446466596, + "learning_rate": 8.426708740773042e-06, + "loss": 0.2582, + "num_tokens": 2704826854.0, + "step": 3546 + }, + { + "epoch": 4.839832289588512, + "grad_norm": 0.21052234791682978, + "learning_rate": 8.42194577757224e-06, + "loss": 0.2455, + "num_tokens": 2705615619.0, + "step": 3547 + }, + { + "epoch": 4.841197870547193, + "grad_norm": 0.1964855047985355, + "learning_rate": 8.417183600824132e-06, + "loss": 0.245, + "num_tokens": 2706392137.0, + "step": 3548 + }, + { + "epoch": 4.842563451505873, + "grad_norm": 0.21128157463535954, + "learning_rate": 8.41242221198146e-06, + "loss": 0.2505, + "num_tokens": 2707169899.0, + "step": 3549 + }, + { + "epoch": 4.843929032464554, + "grad_norm": 0.21304770815320903, + "learning_rate": 8.40766161249671e-06, + "loss": 0.2337, + "num_tokens": 2707889627.0, + "step": 3550 + }, + { + "epoch": 4.845294613423234, + "grad_norm": 0.20810300247801905, + "learning_rate": 8.402901803822137e-06, + "loss": 0.2487, + "num_tokens": 2708638035.0, + "step": 3551 + }, + { + "epoch": 4.846660194381915, + "grad_norm": 0.20006911548321932, + "learning_rate": 8.398142787409753e-06, + "loss": 0.2366, + "num_tokens": 2709405308.0, + "step": 3552 + }, + { + "epoch": 4.848025775340595, + "grad_norm": 0.2045834627363637, + "learning_rate": 8.39338456471133e-06, + "loss": 0.2425, + "num_tokens": 2710251963.0, + "step": 3553 + }, + { + "epoch": 4.849391356299275, + "grad_norm": 0.20023266236681284, + "learning_rate": 8.38862713717839e-06, + "loss": 0.2539, + "num_tokens": 2711018406.0, + "step": 3554 + }, + { + "epoch": 4.850756937257956, + "grad_norm": 0.19065493841653242, + "learning_rate": 8.383870506262222e-06, + "loss": 0.2266, + "num_tokens": 2711759197.0, + "step": 3555 + }, + { + "epoch": 4.852122518216636, + "grad_norm": 0.22519587698657928, + "learning_rate": 8.37911467341387e-06, + "loss": 0.2552, + "num_tokens": 2712655373.0, + "step": 3556 + }, + { + "epoch": 4.853488099175317, + "grad_norm": 0.19900917809307814, + "learning_rate": 8.374359640084126e-06, + "loss": 0.2369, + "num_tokens": 2713446651.0, + "step": 3557 + }, + { + "epoch": 4.854853680133997, + "grad_norm": 0.20971205955338743, + "learning_rate": 8.369605407723553e-06, + "loss": 0.2471, + "num_tokens": 2714220570.0, + "step": 3558 + }, + { + "epoch": 4.856219261092678, + "grad_norm": 0.19241310772989498, + "learning_rate": 8.364851977782455e-06, + "loss": 0.2353, + "num_tokens": 2714983794.0, + "step": 3559 + }, + { + "epoch": 4.8575848420513585, + "grad_norm": 0.19908797912971923, + "learning_rate": 8.360099351710904e-06, + "loss": 0.2281, + "num_tokens": 2715720729.0, + "step": 3560 + }, + { + "epoch": 4.8589504230100395, + "grad_norm": 0.20084692782198404, + "learning_rate": 8.355347530958715e-06, + "loss": 0.2395, + "num_tokens": 2716502020.0, + "step": 3561 + }, + { + "epoch": 4.86031600396872, + "grad_norm": 0.2546755946201292, + "learning_rate": 8.350596516975462e-06, + "loss": 0.2354, + "num_tokens": 2717196436.0, + "step": 3562 + }, + { + "epoch": 4.8616815849274, + "grad_norm": 0.19918458987010348, + "learning_rate": 8.345846311210477e-06, + "loss": 0.2431, + "num_tokens": 2717925040.0, + "step": 3563 + }, + { + "epoch": 4.863047165886081, + "grad_norm": 0.2062546531429062, + "learning_rate": 8.341096915112839e-06, + "loss": 0.2301, + "num_tokens": 2718688108.0, + "step": 3564 + }, + { + "epoch": 4.864412746844761, + "grad_norm": 0.18615863063160107, + "learning_rate": 8.336348330131387e-06, + "loss": 0.2521, + "num_tokens": 2719534484.0, + "step": 3565 + }, + { + "epoch": 4.865778327803442, + "grad_norm": 0.19418140031904302, + "learning_rate": 8.331600557714705e-06, + "loss": 0.238, + "num_tokens": 2720348891.0, + "step": 3566 + }, + { + "epoch": 4.867143908762122, + "grad_norm": 0.20398233599754886, + "learning_rate": 8.326853599311136e-06, + "loss": 0.2281, + "num_tokens": 2721079414.0, + "step": 3567 + }, + { + "epoch": 4.868509489720803, + "grad_norm": 0.19131927059195475, + "learning_rate": 8.32210745636877e-06, + "loss": 0.2419, + "num_tokens": 2721833865.0, + "step": 3568 + }, + { + "epoch": 4.869875070679483, + "grad_norm": 0.223694240397884, + "learning_rate": 8.317362130335447e-06, + "loss": 0.2451, + "num_tokens": 2722596641.0, + "step": 3569 + }, + { + "epoch": 4.871240651638164, + "grad_norm": 0.20246446775712576, + "learning_rate": 8.312617622658765e-06, + "loss": 0.2484, + "num_tokens": 2723309796.0, + "step": 3570 + }, + { + "epoch": 4.872606232596844, + "grad_norm": 0.21352076980869142, + "learning_rate": 8.307873934786064e-06, + "loss": 0.2374, + "num_tokens": 2724062240.0, + "step": 3571 + }, + { + "epoch": 4.873971813555524, + "grad_norm": 0.20839495858598703, + "learning_rate": 8.303131068164441e-06, + "loss": 0.2391, + "num_tokens": 2724784911.0, + "step": 3572 + }, + { + "epoch": 4.875337394514205, + "grad_norm": 0.19283403743920724, + "learning_rate": 8.298389024240734e-06, + "loss": 0.2463, + "num_tokens": 2725660346.0, + "step": 3573 + }, + { + "epoch": 4.8767029754728854, + "grad_norm": 0.2188975380251881, + "learning_rate": 8.293647804461544e-06, + "loss": 0.2393, + "num_tokens": 2726467827.0, + "step": 3574 + }, + { + "epoch": 4.8780685564315664, + "grad_norm": 0.1930371054457472, + "learning_rate": 8.288907410273204e-06, + "loss": 0.2524, + "num_tokens": 2727285599.0, + "step": 3575 + }, + { + "epoch": 4.879434137390247, + "grad_norm": 0.20007964542876167, + "learning_rate": 8.284167843121804e-06, + "loss": 0.2398, + "num_tokens": 2728032046.0, + "step": 3576 + }, + { + "epoch": 4.880799718348928, + "grad_norm": 0.21066604275697146, + "learning_rate": 8.279429104453188e-06, + "loss": 0.2463, + "num_tokens": 2728796451.0, + "step": 3577 + }, + { + "epoch": 4.882165299307608, + "grad_norm": 0.19351901068068902, + "learning_rate": 8.27469119571293e-06, + "loss": 0.2314, + "num_tokens": 2729582635.0, + "step": 3578 + }, + { + "epoch": 4.883530880266289, + "grad_norm": 0.19368512619485292, + "learning_rate": 8.269954118346366e-06, + "loss": 0.2385, + "num_tokens": 2730321865.0, + "step": 3579 + }, + { + "epoch": 4.884896461224969, + "grad_norm": 0.19584588414226956, + "learning_rate": 8.265217873798572e-06, + "loss": 0.2282, + "num_tokens": 2731093657.0, + "step": 3580 + }, + { + "epoch": 4.886262042183649, + "grad_norm": 0.1965614017612995, + "learning_rate": 8.260482463514371e-06, + "loss": 0.2529, + "num_tokens": 2731884885.0, + "step": 3581 + }, + { + "epoch": 4.88762762314233, + "grad_norm": 0.1992329216673602, + "learning_rate": 8.25574788893833e-06, + "loss": 0.2369, + "num_tokens": 2732693043.0, + "step": 3582 + }, + { + "epoch": 4.88899320410101, + "grad_norm": 0.20053416948486844, + "learning_rate": 8.251014151514771e-06, + "loss": 0.239, + "num_tokens": 2733435647.0, + "step": 3583 + }, + { + "epoch": 4.890358785059691, + "grad_norm": 0.209623049122877, + "learning_rate": 8.246281252687742e-06, + "loss": 0.2509, + "num_tokens": 2734171505.0, + "step": 3584 + }, + { + "epoch": 4.891724366018371, + "grad_norm": 0.21257120346203087, + "learning_rate": 8.241549193901049e-06, + "loss": 0.2469, + "num_tokens": 2734883730.0, + "step": 3585 + }, + { + "epoch": 4.893089946977052, + "grad_norm": 0.21156674517408608, + "learning_rate": 8.23681797659824e-06, + "loss": 0.2386, + "num_tokens": 2735634590.0, + "step": 3586 + }, + { + "epoch": 4.894455527935732, + "grad_norm": 0.20416121945755328, + "learning_rate": 8.2320876022226e-06, + "loss": 0.2394, + "num_tokens": 2736399928.0, + "step": 3587 + }, + { + "epoch": 4.895821108894413, + "grad_norm": 0.19582114535550088, + "learning_rate": 8.227358072217167e-06, + "loss": 0.2439, + "num_tokens": 2737162876.0, + "step": 3588 + }, + { + "epoch": 4.897186689853093, + "grad_norm": 0.2062678687483563, + "learning_rate": 8.222629388024711e-06, + "loss": 0.2453, + "num_tokens": 2737920368.0, + "step": 3589 + }, + { + "epoch": 4.8985522708117735, + "grad_norm": 0.2102704331784953, + "learning_rate": 8.217901551087754e-06, + "loss": 0.2458, + "num_tokens": 2738693557.0, + "step": 3590 + }, + { + "epoch": 4.8999178517704545, + "grad_norm": 0.23030925621433862, + "learning_rate": 8.213174562848549e-06, + "loss": 0.2491, + "num_tokens": 2739477387.0, + "step": 3591 + }, + { + "epoch": 4.901283432729135, + "grad_norm": 0.20493383589286573, + "learning_rate": 8.208448424749102e-06, + "loss": 0.2503, + "num_tokens": 2740316512.0, + "step": 3592 + }, + { + "epoch": 4.902649013687816, + "grad_norm": 0.21124112920339275, + "learning_rate": 8.203723138231146e-06, + "loss": 0.236, + "num_tokens": 2741100699.0, + "step": 3593 + }, + { + "epoch": 4.904014594646496, + "grad_norm": 0.22236274232466346, + "learning_rate": 8.198998704736162e-06, + "loss": 0.2466, + "num_tokens": 2741911977.0, + "step": 3594 + }, + { + "epoch": 4.905380175605177, + "grad_norm": 0.1981689406096567, + "learning_rate": 8.194275125705375e-06, + "loss": 0.2301, + "num_tokens": 2742631056.0, + "step": 3595 + }, + { + "epoch": 4.906745756563857, + "grad_norm": 0.2076734004687157, + "learning_rate": 8.18955240257974e-06, + "loss": 0.2457, + "num_tokens": 2743348445.0, + "step": 3596 + }, + { + "epoch": 4.908111337522538, + "grad_norm": 0.2026126275998498, + "learning_rate": 8.184830536799956e-06, + "loss": 0.2446, + "num_tokens": 2744105719.0, + "step": 3597 + }, + { + "epoch": 4.909476918481218, + "grad_norm": 0.20359418709848537, + "learning_rate": 8.180109529806462e-06, + "loss": 0.24, + "num_tokens": 2744835988.0, + "step": 3598 + }, + { + "epoch": 4.910842499439898, + "grad_norm": 0.20000584641532482, + "learning_rate": 8.175389383039433e-06, + "loss": 0.2319, + "num_tokens": 2745620653.0, + "step": 3599 + }, + { + "epoch": 4.912208080398579, + "grad_norm": 0.21431804006258237, + "learning_rate": 8.170670097938778e-06, + "loss": 0.2407, + "num_tokens": 2746332298.0, + "step": 3600 + }, + { + "epoch": 4.913573661357259, + "grad_norm": 0.20882561410743025, + "learning_rate": 8.165951675944147e-06, + "loss": 0.2369, + "num_tokens": 2747033935.0, + "step": 3601 + }, + { + "epoch": 4.91493924231594, + "grad_norm": 0.19252495256970326, + "learning_rate": 8.161234118494933e-06, + "loss": 0.2278, + "num_tokens": 2747729987.0, + "step": 3602 + }, + { + "epoch": 4.91630482327462, + "grad_norm": 0.2130427304366715, + "learning_rate": 8.156517427030245e-06, + "loss": 0.2475, + "num_tokens": 2748484715.0, + "step": 3603 + }, + { + "epoch": 4.917670404233301, + "grad_norm": 0.2087640718858639, + "learning_rate": 8.151801602988953e-06, + "loss": 0.2513, + "num_tokens": 2749251234.0, + "step": 3604 + }, + { + "epoch": 4.919035985191981, + "grad_norm": 0.1975859531251446, + "learning_rate": 8.147086647809648e-06, + "loss": 0.2441, + "num_tokens": 2749986031.0, + "step": 3605 + }, + { + "epoch": 4.920401566150662, + "grad_norm": 0.21768128006095877, + "learning_rate": 8.142372562930657e-06, + "loss": 0.2376, + "num_tokens": 2750693276.0, + "step": 3606 + }, + { + "epoch": 4.9217671471093425, + "grad_norm": 0.2373770240738591, + "learning_rate": 8.137659349790041e-06, + "loss": 0.2491, + "num_tokens": 2751419063.0, + "step": 3607 + }, + { + "epoch": 4.923132728068023, + "grad_norm": 0.19890204640876294, + "learning_rate": 8.132947009825602e-06, + "loss": 0.2417, + "num_tokens": 2752235292.0, + "step": 3608 + }, + { + "epoch": 4.924498309026704, + "grad_norm": 0.1972316863217128, + "learning_rate": 8.128235544474871e-06, + "loss": 0.2356, + "num_tokens": 2752981819.0, + "step": 3609 + }, + { + "epoch": 4.925863889985384, + "grad_norm": 0.19256449218996477, + "learning_rate": 8.123524955175106e-06, + "loss": 0.2423, + "num_tokens": 2753737463.0, + "step": 3610 + }, + { + "epoch": 4.927229470944065, + "grad_norm": 0.20991514573889616, + "learning_rate": 8.118815243363312e-06, + "loss": 0.2386, + "num_tokens": 2754539426.0, + "step": 3611 + }, + { + "epoch": 4.928595051902745, + "grad_norm": 0.19130649572684622, + "learning_rate": 8.114106410476207e-06, + "loss": 0.237, + "num_tokens": 2755325882.0, + "step": 3612 + }, + { + "epoch": 4.929960632861426, + "grad_norm": 0.20109560292562279, + "learning_rate": 8.109398457950266e-06, + "loss": 0.2352, + "num_tokens": 2756043334.0, + "step": 3613 + }, + { + "epoch": 4.931326213820106, + "grad_norm": 0.20361907892763292, + "learning_rate": 8.10469138722167e-06, + "loss": 0.2361, + "num_tokens": 2756755073.0, + "step": 3614 + }, + { + "epoch": 4.932691794778787, + "grad_norm": 0.1976542993519808, + "learning_rate": 8.099985199726348e-06, + "loss": 0.2351, + "num_tokens": 2757492095.0, + "step": 3615 + }, + { + "epoch": 4.934057375737467, + "grad_norm": 0.20369470167830078, + "learning_rate": 8.095279896899953e-06, + "loss": 0.2394, + "num_tokens": 2758317606.0, + "step": 3616 + }, + { + "epoch": 4.935422956696147, + "grad_norm": 0.21028162628745248, + "learning_rate": 8.090575480177867e-06, + "loss": 0.2364, + "num_tokens": 2759043312.0, + "step": 3617 + }, + { + "epoch": 4.936788537654828, + "grad_norm": 0.18853819557674492, + "learning_rate": 8.085871950995207e-06, + "loss": 0.2507, + "num_tokens": 2759838458.0, + "step": 3618 + }, + { + "epoch": 4.938154118613508, + "grad_norm": 0.2075366540724957, + "learning_rate": 8.081169310786812e-06, + "loss": 0.2309, + "num_tokens": 2760573341.0, + "step": 3619 + }, + { + "epoch": 4.939519699572189, + "grad_norm": 0.19610952032696138, + "learning_rate": 8.076467560987262e-06, + "loss": 0.2379, + "num_tokens": 2761411063.0, + "step": 3620 + }, + { + "epoch": 4.940885280530869, + "grad_norm": 0.21686098564168244, + "learning_rate": 8.071766703030846e-06, + "loss": 0.2434, + "num_tokens": 2762103709.0, + "step": 3621 + }, + { + "epoch": 4.94225086148955, + "grad_norm": 0.19040258393560872, + "learning_rate": 8.067066738351601e-06, + "loss": 0.2429, + "num_tokens": 2762953505.0, + "step": 3622 + }, + { + "epoch": 4.9436164424482305, + "grad_norm": 0.20654010294140168, + "learning_rate": 8.062367668383279e-06, + "loss": 0.2288, + "num_tokens": 2763703710.0, + "step": 3623 + }, + { + "epoch": 4.9449820234069115, + "grad_norm": 0.20938723023885195, + "learning_rate": 8.05766949455936e-06, + "loss": 0.2568, + "num_tokens": 2764462657.0, + "step": 3624 + }, + { + "epoch": 4.946347604365592, + "grad_norm": 0.19743299926643723, + "learning_rate": 8.052972218313061e-06, + "loss": 0.2363, + "num_tokens": 2765188444.0, + "step": 3625 + }, + { + "epoch": 4.947713185324272, + "grad_norm": 0.20091260814849643, + "learning_rate": 8.048275841077313e-06, + "loss": 0.2438, + "num_tokens": 2765965507.0, + "step": 3626 + }, + { + "epoch": 4.949078766282953, + "grad_norm": 0.20382566711581449, + "learning_rate": 8.043580364284778e-06, + "loss": 0.252, + "num_tokens": 2766842067.0, + "step": 3627 + }, + { + "epoch": 4.950444347241633, + "grad_norm": 0.19639852185672865, + "learning_rate": 8.038885789367838e-06, + "loss": 0.232, + "num_tokens": 2767582428.0, + "step": 3628 + }, + { + "epoch": 4.951809928200314, + "grad_norm": 0.20249794311792543, + "learning_rate": 8.034192117758613e-06, + "loss": 0.2463, + "num_tokens": 2768317341.0, + "step": 3629 + }, + { + "epoch": 4.953175509158994, + "grad_norm": 0.21634909101175917, + "learning_rate": 8.029499350888932e-06, + "loss": 0.2357, + "num_tokens": 2769082895.0, + "step": 3630 + }, + { + "epoch": 4.954541090117675, + "grad_norm": 0.1883339979845605, + "learning_rate": 8.024807490190361e-06, + "loss": 0.2381, + "num_tokens": 2769872620.0, + "step": 3631 + }, + { + "epoch": 4.955906671076355, + "grad_norm": 0.2066586367517298, + "learning_rate": 8.020116537094178e-06, + "loss": 0.2493, + "num_tokens": 2770615475.0, + "step": 3632 + }, + { + "epoch": 4.957272252035036, + "grad_norm": 0.21854021752679548, + "learning_rate": 8.015426493031391e-06, + "loss": 0.2493, + "num_tokens": 2771407134.0, + "step": 3633 + }, + { + "epoch": 4.958637832993716, + "grad_norm": 0.18771337869133536, + "learning_rate": 8.010737359432731e-06, + "loss": 0.229, + "num_tokens": 2772155870.0, + "step": 3634 + }, + { + "epoch": 4.960003413952396, + "grad_norm": 0.19958105260276332, + "learning_rate": 8.006049137728648e-06, + "loss": 0.2396, + "num_tokens": 2772910956.0, + "step": 3635 + }, + { + "epoch": 4.961368994911077, + "grad_norm": 0.19672794892942547, + "learning_rate": 8.00136182934932e-06, + "loss": 0.2357, + "num_tokens": 2773711165.0, + "step": 3636 + }, + { + "epoch": 4.962734575869757, + "grad_norm": 0.20135473420140793, + "learning_rate": 7.99667543572463e-06, + "loss": 0.2352, + "num_tokens": 2774518130.0, + "step": 3637 + }, + { + "epoch": 4.964100156828438, + "grad_norm": 0.20136316220590142, + "learning_rate": 7.991989958284205e-06, + "loss": 0.2382, + "num_tokens": 2775271792.0, + "step": 3638 + }, + { + "epoch": 4.9654657377871185, + "grad_norm": 0.20712621670129858, + "learning_rate": 7.987305398457383e-06, + "loss": 0.242, + "num_tokens": 2776003204.0, + "step": 3639 + }, + { + "epoch": 4.9668313187457995, + "grad_norm": 0.2513608497472904, + "learning_rate": 7.982621757673209e-06, + "loss": 0.2258, + "num_tokens": 2776781439.0, + "step": 3640 + }, + { + "epoch": 4.96819689970448, + "grad_norm": 0.1957552292277791, + "learning_rate": 7.977939037360469e-06, + "loss": 0.2407, + "num_tokens": 2777534767.0, + "step": 3641 + }, + { + "epoch": 4.969562480663161, + "grad_norm": 0.19853809981331919, + "learning_rate": 7.973257238947652e-06, + "loss": 0.2396, + "num_tokens": 2778328549.0, + "step": 3642 + }, + { + "epoch": 4.970928061621841, + "grad_norm": 0.19211875610601312, + "learning_rate": 7.968576363862977e-06, + "loss": 0.2312, + "num_tokens": 2779012715.0, + "step": 3643 + }, + { + "epoch": 4.972293642580521, + "grad_norm": 0.20333173147422934, + "learning_rate": 7.963896413534369e-06, + "loss": 0.2324, + "num_tokens": 2779781463.0, + "step": 3644 + }, + { + "epoch": 4.973659223539202, + "grad_norm": 0.19915330485137925, + "learning_rate": 7.959217389389487e-06, + "loss": 0.232, + "num_tokens": 2780532354.0, + "step": 3645 + }, + { + "epoch": 4.975024804497882, + "grad_norm": 0.19389389625179473, + "learning_rate": 7.95453929285569e-06, + "loss": 0.2266, + "num_tokens": 2781274866.0, + "step": 3646 + }, + { + "epoch": 4.976390385456563, + "grad_norm": 0.19758436608221389, + "learning_rate": 7.94986212536007e-06, + "loss": 0.241, + "num_tokens": 2782104841.0, + "step": 3647 + }, + { + "epoch": 4.977755966415243, + "grad_norm": 0.19776832512293066, + "learning_rate": 7.945185888329429e-06, + "loss": 0.243, + "num_tokens": 2782912449.0, + "step": 3648 + }, + { + "epoch": 4.979121547373924, + "grad_norm": 0.20407772296522966, + "learning_rate": 7.940510583190274e-06, + "loss": 0.2456, + "num_tokens": 2783653345.0, + "step": 3649 + }, + { + "epoch": 4.980487128332604, + "grad_norm": 0.2075367628776983, + "learning_rate": 7.935836211368849e-06, + "loss": 0.2411, + "num_tokens": 2784429840.0, + "step": 3650 + }, + { + "epoch": 4.981852709291285, + "grad_norm": 0.19355230860662603, + "learning_rate": 7.931162774291097e-06, + "loss": 0.2427, + "num_tokens": 2785198177.0, + "step": 3651 + }, + { + "epoch": 4.983218290249965, + "grad_norm": 0.21356980656480173, + "learning_rate": 7.926490273382683e-06, + "loss": 0.2506, + "num_tokens": 2786001429.0, + "step": 3652 + }, + { + "epoch": 4.984583871208645, + "grad_norm": 0.19281030397362578, + "learning_rate": 7.921818710068982e-06, + "loss": 0.2383, + "num_tokens": 2786759508.0, + "step": 3653 + }, + { + "epoch": 4.985949452167326, + "grad_norm": 0.21570276716270875, + "learning_rate": 7.917148085775093e-06, + "loss": 0.2426, + "num_tokens": 2787496913.0, + "step": 3654 + }, + { + "epoch": 4.9873150331260065, + "grad_norm": 0.2313915573775155, + "learning_rate": 7.912478401925811e-06, + "loss": 0.2321, + "num_tokens": 2788260660.0, + "step": 3655 + }, + { + "epoch": 4.9886806140846875, + "grad_norm": 0.20281441677370476, + "learning_rate": 7.907809659945658e-06, + "loss": 0.2333, + "num_tokens": 2789039060.0, + "step": 3656 + }, + { + "epoch": 4.990046195043368, + "grad_norm": 0.20232074065909583, + "learning_rate": 7.903141861258873e-06, + "loss": 0.245, + "num_tokens": 2789797427.0, + "step": 3657 + }, + { + "epoch": 4.991411776002049, + "grad_norm": 0.20535304228083365, + "learning_rate": 7.898475007289388e-06, + "loss": 0.2288, + "num_tokens": 2790494979.0, + "step": 3658 + }, + { + "epoch": 4.992777356960729, + "grad_norm": 0.27039141867063016, + "learning_rate": 7.893809099460865e-06, + "loss": 0.2564, + "num_tokens": 2791259668.0, + "step": 3659 + }, + { + "epoch": 4.99414293791941, + "grad_norm": 0.2155714131010695, + "learning_rate": 7.889144139196665e-06, + "loss": 0.2482, + "num_tokens": 2792032408.0, + "step": 3660 + }, + { + "epoch": 4.99550851887809, + "grad_norm": 0.21672663142302623, + "learning_rate": 7.884480127919872e-06, + "loss": 0.237, + "num_tokens": 2792767945.0, + "step": 3661 + }, + { + "epoch": 4.99687409983677, + "grad_norm": 0.21372977274626537, + "learning_rate": 7.879817067053268e-06, + "loss": 0.2414, + "num_tokens": 2793428592.0, + "step": 3662 + }, + { + "epoch": 4.998239680795451, + "grad_norm": 0.2059980092405366, + "learning_rate": 7.875154958019354e-06, + "loss": 0.2406, + "num_tokens": 2794167181.0, + "step": 3663 + }, + { + "epoch": 4.999605261754131, + "grad_norm": 0.19841216371331458, + "learning_rate": 7.870493802240335e-06, + "loss": 0.2482, + "num_tokens": 2794978290.0, + "step": 3664 + }, + { + "epoch": 5.0, + "grad_norm": 0.19841216371331458, + "learning_rate": 7.86583360113813e-06, + "loss": 0.2267, + "num_tokens": 2795210680.0, + "step": 3665 + }, + { + "epoch": 5.00136558095868, + "grad_norm": 0.4001628279588914, + "learning_rate": 7.861174356134365e-06, + "loss": 0.1995, + "num_tokens": 2795964972.0, + "step": 3666 + }, + { + "epoch": 5.002731161917361, + "grad_norm": 0.34072026445612424, + "learning_rate": 7.856516068650373e-06, + "loss": 0.2018, + "num_tokens": 2796643409.0, + "step": 3667 + }, + { + "epoch": 5.004096742876041, + "grad_norm": 0.3275820481854563, + "learning_rate": 7.851858740107196e-06, + "loss": 0.219, + "num_tokens": 2797453033.0, + "step": 3668 + }, + { + "epoch": 5.005462323834722, + "grad_norm": 0.2324079320253321, + "learning_rate": 7.847202371925581e-06, + "loss": 0.199, + "num_tokens": 2798174075.0, + "step": 3669 + }, + { + "epoch": 5.006827904793402, + "grad_norm": 0.25975409431423935, + "learning_rate": 7.84254696552599e-06, + "loss": 0.2022, + "num_tokens": 2799004723.0, + "step": 3670 + }, + { + "epoch": 5.008193485752083, + "grad_norm": 0.34408952507927093, + "learning_rate": 7.83789252232858e-06, + "loss": 0.2026, + "num_tokens": 2799755750.0, + "step": 3671 + }, + { + "epoch": 5.0095590667107635, + "grad_norm": 0.3189497281640464, + "learning_rate": 7.833239043753222e-06, + "loss": 0.2066, + "num_tokens": 2800564904.0, + "step": 3672 + }, + { + "epoch": 5.0109246476694445, + "grad_norm": 0.3053696020576775, + "learning_rate": 7.828586531219493e-06, + "loss": 0.1954, + "num_tokens": 2801252036.0, + "step": 3673 + }, + { + "epoch": 5.012290228628125, + "grad_norm": 0.26771805028266477, + "learning_rate": 7.823934986146667e-06, + "loss": 0.2021, + "num_tokens": 2802019845.0, + "step": 3674 + }, + { + "epoch": 5.013655809586805, + "grad_norm": 0.2901828574004597, + "learning_rate": 7.819284409953737e-06, + "loss": 0.2013, + "num_tokens": 2802841037.0, + "step": 3675 + }, + { + "epoch": 5.015021390545486, + "grad_norm": 0.3024580261593043, + "learning_rate": 7.814634804059384e-06, + "loss": 0.1999, + "num_tokens": 2803582236.0, + "step": 3676 + }, + { + "epoch": 5.016386971504166, + "grad_norm": 0.24260497397171676, + "learning_rate": 7.809986169882012e-06, + "loss": 0.2019, + "num_tokens": 2804409437.0, + "step": 3677 + }, + { + "epoch": 5.017752552462847, + "grad_norm": 0.23583735591170157, + "learning_rate": 7.805338508839709e-06, + "loss": 0.2062, + "num_tokens": 2805154820.0, + "step": 3678 + }, + { + "epoch": 5.019118133421527, + "grad_norm": 0.2654460919568417, + "learning_rate": 7.800691822350277e-06, + "loss": 0.2043, + "num_tokens": 2805941653.0, + "step": 3679 + }, + { + "epoch": 5.020483714380208, + "grad_norm": 0.23139268627089235, + "learning_rate": 7.796046111831221e-06, + "loss": 0.2022, + "num_tokens": 2806774308.0, + "step": 3680 + }, + { + "epoch": 5.021849295338888, + "grad_norm": 0.22902000343778345, + "learning_rate": 7.791401378699743e-06, + "loss": 0.2118, + "num_tokens": 2807512591.0, + "step": 3681 + }, + { + "epoch": 5.023214876297569, + "grad_norm": 0.23770470185857015, + "learning_rate": 7.786757624372753e-06, + "loss": 0.201, + "num_tokens": 2808283677.0, + "step": 3682 + }, + { + "epoch": 5.024580457256249, + "grad_norm": 0.2146078958051942, + "learning_rate": 7.782114850266856e-06, + "loss": 0.1975, + "num_tokens": 2808976730.0, + "step": 3683 + }, + { + "epoch": 5.025946038214929, + "grad_norm": 0.23351771949925965, + "learning_rate": 7.777473057798366e-06, + "loss": 0.2015, + "num_tokens": 2809735010.0, + "step": 3684 + }, + { + "epoch": 5.02731161917361, + "grad_norm": 0.24001051018190273, + "learning_rate": 7.772832248383291e-06, + "loss": 0.2027, + "num_tokens": 2810470048.0, + "step": 3685 + }, + { + "epoch": 5.02867720013229, + "grad_norm": 0.22526015829022114, + "learning_rate": 7.76819242343734e-06, + "loss": 0.2033, + "num_tokens": 2811258588.0, + "step": 3686 + }, + { + "epoch": 5.030042781090971, + "grad_norm": 0.22520778826825588, + "learning_rate": 7.763553584375922e-06, + "loss": 0.2021, + "num_tokens": 2812074882.0, + "step": 3687 + }, + { + "epoch": 5.0314083620496515, + "grad_norm": 0.2348953482507149, + "learning_rate": 7.758915732614145e-06, + "loss": 0.1904, + "num_tokens": 2812746233.0, + "step": 3688 + }, + { + "epoch": 5.0327739430083325, + "grad_norm": 0.21166466512424384, + "learning_rate": 7.754278869566823e-06, + "loss": 0.2085, + "num_tokens": 2813594249.0, + "step": 3689 + }, + { + "epoch": 5.034139523967013, + "grad_norm": 0.2276856315949832, + "learning_rate": 7.74964299664845e-06, + "loss": 0.1896, + "num_tokens": 2814329150.0, + "step": 3690 + }, + { + "epoch": 5.035505104925694, + "grad_norm": 0.2286901139049727, + "learning_rate": 7.745008115273245e-06, + "loss": 0.1969, + "num_tokens": 2815023690.0, + "step": 3691 + }, + { + "epoch": 5.036870685884374, + "grad_norm": 0.21667529143283884, + "learning_rate": 7.740374226855097e-06, + "loss": 0.1963, + "num_tokens": 2815717499.0, + "step": 3692 + }, + { + "epoch": 5.038236266843054, + "grad_norm": 0.21341328472382612, + "learning_rate": 7.735741332807615e-06, + "loss": 0.2078, + "num_tokens": 2816481407.0, + "step": 3693 + }, + { + "epoch": 5.039601847801735, + "grad_norm": 0.21522423298526153, + "learning_rate": 7.731109434544088e-06, + "loss": 0.1915, + "num_tokens": 2817263917.0, + "step": 3694 + }, + { + "epoch": 5.040967428760415, + "grad_norm": 0.2196623554731143, + "learning_rate": 7.726478533477506e-06, + "loss": 0.2038, + "num_tokens": 2818118964.0, + "step": 3695 + }, + { + "epoch": 5.042333009719096, + "grad_norm": 0.2001364440613833, + "learning_rate": 7.721848631020563e-06, + "loss": 0.211, + "num_tokens": 2818851156.0, + "step": 3696 + }, + { + "epoch": 5.043698590677776, + "grad_norm": 0.22467046803907847, + "learning_rate": 7.717219728585637e-06, + "loss": 0.2064, + "num_tokens": 2819665255.0, + "step": 3697 + }, + { + "epoch": 5.045064171636457, + "grad_norm": 0.20383084349992897, + "learning_rate": 7.712591827584808e-06, + "loss": 0.1943, + "num_tokens": 2820434186.0, + "step": 3698 + }, + { + "epoch": 5.046429752595137, + "grad_norm": 0.19247511422154817, + "learning_rate": 7.70796492942984e-06, + "loss": 0.2064, + "num_tokens": 2821179220.0, + "step": 3699 + }, + { + "epoch": 5.047795333553818, + "grad_norm": 0.23128810215845758, + "learning_rate": 7.703339035532215e-06, + "loss": 0.1997, + "num_tokens": 2821918596.0, + "step": 3700 + }, + { + "epoch": 5.049160914512498, + "grad_norm": 0.21261078963470473, + "learning_rate": 7.698714147303081e-06, + "loss": 0.1963, + "num_tokens": 2822697516.0, + "step": 3701 + }, + { + "epoch": 5.050526495471178, + "grad_norm": 0.21669756619015867, + "learning_rate": 7.69409026615329e-06, + "loss": 0.1973, + "num_tokens": 2823421267.0, + "step": 3702 + }, + { + "epoch": 5.051892076429859, + "grad_norm": 0.20975109873183104, + "learning_rate": 7.689467393493397e-06, + "loss": 0.2073, + "num_tokens": 2824164920.0, + "step": 3703 + }, + { + "epoch": 5.0532576573885395, + "grad_norm": 0.2205330293566889, + "learning_rate": 7.684845530733634e-06, + "loss": 0.2024, + "num_tokens": 2824857444.0, + "step": 3704 + }, + { + "epoch": 5.0546232383472205, + "grad_norm": 0.22427808080102965, + "learning_rate": 7.680224679283932e-06, + "loss": 0.2024, + "num_tokens": 2825571935.0, + "step": 3705 + }, + { + "epoch": 5.055988819305901, + "grad_norm": 0.20347876564801795, + "learning_rate": 7.675604840553912e-06, + "loss": 0.2057, + "num_tokens": 2826424442.0, + "step": 3706 + }, + { + "epoch": 5.057354400264582, + "grad_norm": 0.2274325552370595, + "learning_rate": 7.67098601595289e-06, + "loss": 0.2156, + "num_tokens": 2827244630.0, + "step": 3707 + }, + { + "epoch": 5.058719981223262, + "grad_norm": 0.2268840569569265, + "learning_rate": 7.666368206889863e-06, + "loss": 0.2017, + "num_tokens": 2827980597.0, + "step": 3708 + }, + { + "epoch": 5.060085562181943, + "grad_norm": 0.22273853802410284, + "learning_rate": 7.661751414773534e-06, + "loss": 0.2026, + "num_tokens": 2828694470.0, + "step": 3709 + }, + { + "epoch": 5.061451143140623, + "grad_norm": 0.22282824953628386, + "learning_rate": 7.65713564101228e-06, + "loss": 0.1957, + "num_tokens": 2829360585.0, + "step": 3710 + }, + { + "epoch": 5.062816724099303, + "grad_norm": 0.22424864057079963, + "learning_rate": 7.652520887014173e-06, + "loss": 0.2022, + "num_tokens": 2830203222.0, + "step": 3711 + }, + { + "epoch": 5.064182305057984, + "grad_norm": 0.226770367953203, + "learning_rate": 7.647907154186977e-06, + "loss": 0.2029, + "num_tokens": 2830991306.0, + "step": 3712 + }, + { + "epoch": 5.065547886016664, + "grad_norm": 0.23004367929525155, + "learning_rate": 7.64329444393814e-06, + "loss": 0.2039, + "num_tokens": 2831704948.0, + "step": 3713 + }, + { + "epoch": 5.066913466975345, + "grad_norm": 0.2277017091020458, + "learning_rate": 7.638682757674804e-06, + "loss": 0.2135, + "num_tokens": 2832444884.0, + "step": 3714 + }, + { + "epoch": 5.068279047934025, + "grad_norm": 0.22953839710862786, + "learning_rate": 7.634072096803791e-06, + "loss": 0.2024, + "num_tokens": 2833255268.0, + "step": 3715 + }, + { + "epoch": 5.069644628892706, + "grad_norm": 0.19573686310396027, + "learning_rate": 7.629462462731615e-06, + "loss": 0.1905, + "num_tokens": 2833992878.0, + "step": 3716 + }, + { + "epoch": 5.071010209851386, + "grad_norm": 0.21668899057666977, + "learning_rate": 7.62485385686448e-06, + "loss": 0.2121, + "num_tokens": 2834710899.0, + "step": 3717 + }, + { + "epoch": 5.072375790810067, + "grad_norm": 0.22868741849257335, + "learning_rate": 7.620246280608261e-06, + "loss": 0.2003, + "num_tokens": 2835427858.0, + "step": 3718 + }, + { + "epoch": 5.073741371768747, + "grad_norm": 0.19613813216364076, + "learning_rate": 7.615639735368545e-06, + "loss": 0.1963, + "num_tokens": 2836266860.0, + "step": 3719 + }, + { + "epoch": 5.0751069527274275, + "grad_norm": 0.20738454564924447, + "learning_rate": 7.611034222550575e-06, + "loss": 0.2089, + "num_tokens": 2837056039.0, + "step": 3720 + }, + { + "epoch": 5.0764725336861085, + "grad_norm": 0.2247428984936338, + "learning_rate": 7.606429743559305e-06, + "loss": 0.194, + "num_tokens": 2837827937.0, + "step": 3721 + }, + { + "epoch": 5.077838114644789, + "grad_norm": 0.20454990700879552, + "learning_rate": 7.601826299799353e-06, + "loss": 0.2045, + "num_tokens": 2838594254.0, + "step": 3722 + }, + { + "epoch": 5.07920369560347, + "grad_norm": 0.21193935293288915, + "learning_rate": 7.597223892675037e-06, + "loss": 0.2036, + "num_tokens": 2839353558.0, + "step": 3723 + }, + { + "epoch": 5.08056927656215, + "grad_norm": 0.22863906505878895, + "learning_rate": 7.592622523590349e-06, + "loss": 0.1957, + "num_tokens": 2840127778.0, + "step": 3724 + }, + { + "epoch": 5.081934857520831, + "grad_norm": 0.20401087580320995, + "learning_rate": 7.5880221939489695e-06, + "loss": 0.2014, + "num_tokens": 2840824736.0, + "step": 3725 + }, + { + "epoch": 5.083300438479511, + "grad_norm": 0.2333431824315937, + "learning_rate": 7.583422905154259e-06, + "loss": 0.2034, + "num_tokens": 2841547904.0, + "step": 3726 + }, + { + "epoch": 5.084666019438192, + "grad_norm": 0.22655433158373553, + "learning_rate": 7.578824658609254e-06, + "loss": 0.1961, + "num_tokens": 2842292439.0, + "step": 3727 + }, + { + "epoch": 5.086031600396872, + "grad_norm": 0.23722138169270146, + "learning_rate": 7.574227455716696e-06, + "loss": 0.197, + "num_tokens": 2843018473.0, + "step": 3728 + }, + { + "epoch": 5.087397181355552, + "grad_norm": 0.20288394990091863, + "learning_rate": 7.569631297878977e-06, + "loss": 0.2009, + "num_tokens": 2843820438.0, + "step": 3729 + }, + { + "epoch": 5.088762762314233, + "grad_norm": 0.2105290634998853, + "learning_rate": 7.565036186498196e-06, + "loss": 0.2066, + "num_tokens": 2844602083.0, + "step": 3730 + }, + { + "epoch": 5.090128343272913, + "grad_norm": 0.20658309011248943, + "learning_rate": 7.560442122976117e-06, + "loss": 0.2041, + "num_tokens": 2845430361.0, + "step": 3731 + }, + { + "epoch": 5.091493924231594, + "grad_norm": 0.22996818891697138, + "learning_rate": 7.555849108714192e-06, + "loss": 0.2062, + "num_tokens": 2846251497.0, + "step": 3732 + }, + { + "epoch": 5.092859505190274, + "grad_norm": 0.19887106858574144, + "learning_rate": 7.551257145113555e-06, + "loss": 0.1968, + "num_tokens": 2846985223.0, + "step": 3733 + }, + { + "epoch": 5.094225086148955, + "grad_norm": 0.22911570163904468, + "learning_rate": 7.546666233575004e-06, + "loss": 0.2034, + "num_tokens": 2847711644.0, + "step": 3734 + }, + { + "epoch": 5.095590667107635, + "grad_norm": 0.19554646022596142, + "learning_rate": 7.542076375499039e-06, + "loss": 0.2003, + "num_tokens": 2848517942.0, + "step": 3735 + }, + { + "epoch": 5.096956248066316, + "grad_norm": 0.2284781528615546, + "learning_rate": 7.537487572285816e-06, + "loss": 0.2097, + "num_tokens": 2849223412.0, + "step": 3736 + }, + { + "epoch": 5.0983218290249965, + "grad_norm": 0.2427992820342156, + "learning_rate": 7.532899825335191e-06, + "loss": 0.201, + "num_tokens": 2849990418.0, + "step": 3737 + }, + { + "epoch": 5.099687409983677, + "grad_norm": 0.20976585153755753, + "learning_rate": 7.528313136046679e-06, + "loss": 0.2038, + "num_tokens": 2850760436.0, + "step": 3738 + }, + { + "epoch": 5.101052990942358, + "grad_norm": 0.21762376143196477, + "learning_rate": 7.523727505819487e-06, + "loss": 0.1968, + "num_tokens": 2851452950.0, + "step": 3739 + }, + { + "epoch": 5.102418571901038, + "grad_norm": 0.2179676664339702, + "learning_rate": 7.519142936052486e-06, + "loss": 0.2084, + "num_tokens": 2852175881.0, + "step": 3740 + }, + { + "epoch": 5.103784152859719, + "grad_norm": 0.28221146791758867, + "learning_rate": 7.5145594281442326e-06, + "loss": 0.1978, + "num_tokens": 2852957481.0, + "step": 3741 + }, + { + "epoch": 5.105149733818399, + "grad_norm": 0.22843899646330948, + "learning_rate": 7.509976983492957e-06, + "loss": 0.2093, + "num_tokens": 2853750089.0, + "step": 3742 + }, + { + "epoch": 5.10651531477708, + "grad_norm": 0.21814535463044143, + "learning_rate": 7.505395603496562e-06, + "loss": 0.2058, + "num_tokens": 2854544525.0, + "step": 3743 + }, + { + "epoch": 5.10788089573576, + "grad_norm": 0.2203516031446968, + "learning_rate": 7.50081528955263e-06, + "loss": 0.2008, + "num_tokens": 2855364298.0, + "step": 3744 + }, + { + "epoch": 5.109246476694441, + "grad_norm": 0.22528566388532117, + "learning_rate": 7.496236043058414e-06, + "loss": 0.2048, + "num_tokens": 2856089749.0, + "step": 3745 + }, + { + "epoch": 5.110612057653121, + "grad_norm": 0.21776986766123949, + "learning_rate": 7.491657865410853e-06, + "loss": 0.2035, + "num_tokens": 2856825111.0, + "step": 3746 + }, + { + "epoch": 5.111977638611801, + "grad_norm": 0.22441463456951855, + "learning_rate": 7.4870807580065354e-06, + "loss": 0.2027, + "num_tokens": 2857597980.0, + "step": 3747 + }, + { + "epoch": 5.113343219570482, + "grad_norm": 0.2152520420262227, + "learning_rate": 7.4825047222417524e-06, + "loss": 0.2146, + "num_tokens": 2858340334.0, + "step": 3748 + }, + { + "epoch": 5.114708800529162, + "grad_norm": 0.2387304125560509, + "learning_rate": 7.477929759512448e-06, + "loss": 0.2114, + "num_tokens": 2859133115.0, + "step": 3749 + }, + { + "epoch": 5.116074381487843, + "grad_norm": 0.21248524117583617, + "learning_rate": 7.473355871214248e-06, + "loss": 0.2125, + "num_tokens": 2859931061.0, + "step": 3750 + }, + { + "epoch": 5.117439962446523, + "grad_norm": 0.22661845305761805, + "learning_rate": 7.4687830587424436e-06, + "loss": 0.1952, + "num_tokens": 2860671638.0, + "step": 3751 + }, + { + "epoch": 5.118805543405204, + "grad_norm": 0.21056618631875612, + "learning_rate": 7.4642113234920035e-06, + "loss": 0.2055, + "num_tokens": 2861466618.0, + "step": 3752 + }, + { + "epoch": 5.1201711243638846, + "grad_norm": 0.20036601439947804, + "learning_rate": 7.459640666857568e-06, + "loss": 0.2048, + "num_tokens": 2862204795.0, + "step": 3753 + }, + { + "epoch": 5.1215367053225656, + "grad_norm": 0.21565698999249916, + "learning_rate": 7.455071090233441e-06, + "loss": 0.1992, + "num_tokens": 2862941357.0, + "step": 3754 + }, + { + "epoch": 5.122902286281246, + "grad_norm": 0.20057741944365076, + "learning_rate": 7.450502595013611e-06, + "loss": 0.2057, + "num_tokens": 2863732177.0, + "step": 3755 + }, + { + "epoch": 5.124267867239926, + "grad_norm": 0.23608369377011024, + "learning_rate": 7.445935182591724e-06, + "loss": 0.2003, + "num_tokens": 2864558257.0, + "step": 3756 + }, + { + "epoch": 5.125633448198607, + "grad_norm": 0.2151746674087631, + "learning_rate": 7.441368854361095e-06, + "loss": 0.2129, + "num_tokens": 2865332519.0, + "step": 3757 + }, + { + "epoch": 5.126999029157287, + "grad_norm": 0.21374828683100594, + "learning_rate": 7.436803611714722e-06, + "loss": 0.2021, + "num_tokens": 2866081744.0, + "step": 3758 + }, + { + "epoch": 5.128364610115968, + "grad_norm": 0.21892295521800512, + "learning_rate": 7.432239456045254e-06, + "loss": 0.2, + "num_tokens": 2866771195.0, + "step": 3759 + }, + { + "epoch": 5.129730191074648, + "grad_norm": 0.23259196818141742, + "learning_rate": 7.427676388745027e-06, + "loss": 0.2107, + "num_tokens": 2867479050.0, + "step": 3760 + }, + { + "epoch": 5.131095772033329, + "grad_norm": 0.2200191554884494, + "learning_rate": 7.423114411206023e-06, + "loss": 0.2076, + "num_tokens": 2868244794.0, + "step": 3761 + }, + { + "epoch": 5.132461352992009, + "grad_norm": 0.2088709206351495, + "learning_rate": 7.418553524819916e-06, + "loss": 0.1979, + "num_tokens": 2869069644.0, + "step": 3762 + }, + { + "epoch": 5.13382693395069, + "grad_norm": 0.2253836309377385, + "learning_rate": 7.4139937309780245e-06, + "loss": 0.2016, + "num_tokens": 2869873024.0, + "step": 3763 + }, + { + "epoch": 5.13519251490937, + "grad_norm": 0.206326144224142, + "learning_rate": 7.40943503107135e-06, + "loss": 0.209, + "num_tokens": 2870678881.0, + "step": 3764 + }, + { + "epoch": 5.13655809586805, + "grad_norm": 0.22355355451112124, + "learning_rate": 7.4048774264905575e-06, + "loss": 0.1993, + "num_tokens": 2871524178.0, + "step": 3765 + }, + { + "epoch": 5.137923676826731, + "grad_norm": 0.1955354341137779, + "learning_rate": 7.4003209186259675e-06, + "loss": 0.1998, + "num_tokens": 2872279319.0, + "step": 3766 + }, + { + "epoch": 5.1392892577854115, + "grad_norm": 0.20926443675354034, + "learning_rate": 7.395765508867576e-06, + "loss": 0.2058, + "num_tokens": 2873073161.0, + "step": 3767 + }, + { + "epoch": 5.1406548387440925, + "grad_norm": 0.21712716331852347, + "learning_rate": 7.391211198605041e-06, + "loss": 0.1911, + "num_tokens": 2873833436.0, + "step": 3768 + }, + { + "epoch": 5.142020419702773, + "grad_norm": 0.23070246301849928, + "learning_rate": 7.38665798922769e-06, + "loss": 0.2116, + "num_tokens": 2874557440.0, + "step": 3769 + }, + { + "epoch": 5.143386000661454, + "grad_norm": 0.2181409977924548, + "learning_rate": 7.382105882124502e-06, + "loss": 0.2122, + "num_tokens": 2875323341.0, + "step": 3770 + }, + { + "epoch": 5.144751581620134, + "grad_norm": 0.22062018394722555, + "learning_rate": 7.377554878684132e-06, + "loss": 0.197, + "num_tokens": 2876016624.0, + "step": 3771 + }, + { + "epoch": 5.146117162578815, + "grad_norm": 0.21785043859602848, + "learning_rate": 7.373004980294898e-06, + "loss": 0.206, + "num_tokens": 2876723689.0, + "step": 3772 + }, + { + "epoch": 5.147482743537495, + "grad_norm": 0.22602512627801633, + "learning_rate": 7.368456188344766e-06, + "loss": 0.1919, + "num_tokens": 2877452052.0, + "step": 3773 + }, + { + "epoch": 5.148848324496175, + "grad_norm": 0.22340938741779331, + "learning_rate": 7.3639085042213884e-06, + "loss": 0.2106, + "num_tokens": 2878273742.0, + "step": 3774 + }, + { + "epoch": 5.150213905454856, + "grad_norm": 0.196962586869315, + "learning_rate": 7.359361929312054e-06, + "loss": 0.2097, + "num_tokens": 2879075204.0, + "step": 3775 + }, + { + "epoch": 5.151579486413536, + "grad_norm": 0.22802121366441894, + "learning_rate": 7.354816465003738e-06, + "loss": 0.1978, + "num_tokens": 2879853777.0, + "step": 3776 + }, + { + "epoch": 5.152945067372217, + "grad_norm": 0.2193585328732095, + "learning_rate": 7.350272112683057e-06, + "loss": 0.2003, + "num_tokens": 2880626968.0, + "step": 3777 + }, + { + "epoch": 5.154310648330897, + "grad_norm": 0.22210305684423354, + "learning_rate": 7.345728873736298e-06, + "loss": 0.2046, + "num_tokens": 2881373668.0, + "step": 3778 + }, + { + "epoch": 5.155676229289578, + "grad_norm": 0.22008201039739372, + "learning_rate": 7.341186749549407e-06, + "loss": 0.2097, + "num_tokens": 2882119236.0, + "step": 3779 + }, + { + "epoch": 5.157041810248258, + "grad_norm": 0.21106241298947118, + "learning_rate": 7.3366457415079885e-06, + "loss": 0.1989, + "num_tokens": 2882914376.0, + "step": 3780 + }, + { + "epoch": 5.158407391206939, + "grad_norm": 0.2171274902175172, + "learning_rate": 7.332105850997309e-06, + "loss": 0.2078, + "num_tokens": 2883704652.0, + "step": 3781 + }, + { + "epoch": 5.159772972165619, + "grad_norm": 0.20068729354325243, + "learning_rate": 7.327567079402287e-06, + "loss": 0.202, + "num_tokens": 2884510638.0, + "step": 3782 + }, + { + "epoch": 5.1611385531242995, + "grad_norm": 0.20897623155718253, + "learning_rate": 7.323029428107515e-06, + "loss": 0.1935, + "num_tokens": 2885284587.0, + "step": 3783 + }, + { + "epoch": 5.1625041340829805, + "grad_norm": 0.20352511808835652, + "learning_rate": 7.318492898497225e-06, + "loss": 0.1994, + "num_tokens": 2886097012.0, + "step": 3784 + }, + { + "epoch": 5.163869715041661, + "grad_norm": 0.2240623004510108, + "learning_rate": 7.313957491955322e-06, + "loss": 0.2091, + "num_tokens": 2886829757.0, + "step": 3785 + }, + { + "epoch": 5.165235296000342, + "grad_norm": 0.22570518571551393, + "learning_rate": 7.3094232098653565e-06, + "loss": 0.2094, + "num_tokens": 2887610830.0, + "step": 3786 + }, + { + "epoch": 5.166600876959022, + "grad_norm": 0.21497929877471925, + "learning_rate": 7.304890053610547e-06, + "loss": 0.1992, + "num_tokens": 2888281944.0, + "step": 3787 + }, + { + "epoch": 5.167966457917703, + "grad_norm": 0.21377304334486458, + "learning_rate": 7.30035802457376e-06, + "loss": 0.2132, + "num_tokens": 2889179974.0, + "step": 3788 + }, + { + "epoch": 5.169332038876383, + "grad_norm": 0.22426007325273972, + "learning_rate": 7.29582712413752e-06, + "loss": 0.2013, + "num_tokens": 2889957121.0, + "step": 3789 + }, + { + "epoch": 5.170697619835064, + "grad_norm": 0.2241249978544526, + "learning_rate": 7.291297353684013e-06, + "loss": 0.1962, + "num_tokens": 2890692868.0, + "step": 3790 + }, + { + "epoch": 5.172063200793744, + "grad_norm": 0.2187048842876811, + "learning_rate": 7.2867687145950695e-06, + "loss": 0.2134, + "num_tokens": 2891503383.0, + "step": 3791 + }, + { + "epoch": 5.173428781752424, + "grad_norm": 0.22639862141911835, + "learning_rate": 7.282241208252185e-06, + "loss": 0.2, + "num_tokens": 2892299751.0, + "step": 3792 + }, + { + "epoch": 5.174794362711105, + "grad_norm": 0.22848740792986263, + "learning_rate": 7.277714836036507e-06, + "loss": 0.2114, + "num_tokens": 2893042581.0, + "step": 3793 + }, + { + "epoch": 5.176159943669785, + "grad_norm": 0.22077816366746142, + "learning_rate": 7.273189599328836e-06, + "loss": 0.2006, + "num_tokens": 2893673349.0, + "step": 3794 + }, + { + "epoch": 5.177525524628466, + "grad_norm": 0.2556511709358179, + "learning_rate": 7.268665499509624e-06, + "loss": 0.2054, + "num_tokens": 2894460176.0, + "step": 3795 + }, + { + "epoch": 5.178891105587146, + "grad_norm": 0.22054666116471505, + "learning_rate": 7.2641425379589735e-06, + "loss": 0.208, + "num_tokens": 2895186534.0, + "step": 3796 + }, + { + "epoch": 5.180256686545827, + "grad_norm": 0.22161928409977202, + "learning_rate": 7.259620716056652e-06, + "loss": 0.2097, + "num_tokens": 2895975883.0, + "step": 3797 + }, + { + "epoch": 5.181622267504507, + "grad_norm": 0.21765944734530993, + "learning_rate": 7.255100035182068e-06, + "loss": 0.1996, + "num_tokens": 2896692568.0, + "step": 3798 + }, + { + "epoch": 5.182987848463188, + "grad_norm": 0.2107155905680877, + "learning_rate": 7.250580496714281e-06, + "loss": 0.2026, + "num_tokens": 2897458690.0, + "step": 3799 + }, + { + "epoch": 5.1843534294218685, + "grad_norm": 0.22710858803140827, + "learning_rate": 7.246062102032014e-06, + "loss": 0.2094, + "num_tokens": 2898234091.0, + "step": 3800 + }, + { + "epoch": 5.185719010380549, + "grad_norm": 0.20287614195019252, + "learning_rate": 7.241544852513634e-06, + "loss": 0.1985, + "num_tokens": 2899010777.0, + "step": 3801 + }, + { + "epoch": 5.18708459133923, + "grad_norm": 0.21186611234250663, + "learning_rate": 7.237028749537145e-06, + "loss": 0.1963, + "num_tokens": 2899760328.0, + "step": 3802 + }, + { + "epoch": 5.18845017229791, + "grad_norm": 0.22557872055123027, + "learning_rate": 7.2325137944802334e-06, + "loss": 0.1951, + "num_tokens": 2900535013.0, + "step": 3803 + }, + { + "epoch": 5.189815753256591, + "grad_norm": 0.20604432958115748, + "learning_rate": 7.227999988720207e-06, + "loss": 0.1976, + "num_tokens": 2901259567.0, + "step": 3804 + }, + { + "epoch": 5.191181334215271, + "grad_norm": 0.21384251940926238, + "learning_rate": 7.223487333634025e-06, + "loss": 0.1973, + "num_tokens": 2902039980.0, + "step": 3805 + }, + { + "epoch": 5.192546915173952, + "grad_norm": 0.2768980835830079, + "learning_rate": 7.218975830598318e-06, + "loss": 0.2043, + "num_tokens": 2902888144.0, + "step": 3806 + }, + { + "epoch": 5.193912496132632, + "grad_norm": 0.19696957616131214, + "learning_rate": 7.214465480989339e-06, + "loss": 0.1971, + "num_tokens": 2903800533.0, + "step": 3807 + }, + { + "epoch": 5.195278077091313, + "grad_norm": 0.2058465700323115, + "learning_rate": 7.209956286183006e-06, + "loss": 0.2165, + "num_tokens": 2904650321.0, + "step": 3808 + }, + { + "epoch": 5.196643658049993, + "grad_norm": 0.2229418133284332, + "learning_rate": 7.205448247554876e-06, + "loss": 0.1992, + "num_tokens": 2905378165.0, + "step": 3809 + }, + { + "epoch": 5.198009239008673, + "grad_norm": 0.20828585528976826, + "learning_rate": 7.200941366480159e-06, + "loss": 0.2018, + "num_tokens": 2906165429.0, + "step": 3810 + }, + { + "epoch": 5.199374819967354, + "grad_norm": 0.21540492539970196, + "learning_rate": 7.196435644333708e-06, + "loss": 0.1966, + "num_tokens": 2906881168.0, + "step": 3811 + }, + { + "epoch": 5.200740400926034, + "grad_norm": 0.21940943135553798, + "learning_rate": 7.19193108249002e-06, + "loss": 0.2118, + "num_tokens": 2907700720.0, + "step": 3812 + }, + { + "epoch": 5.202105981884715, + "grad_norm": 0.20710881949041968, + "learning_rate": 7.187427682323252e-06, + "loss": 0.2004, + "num_tokens": 2908502238.0, + "step": 3813 + }, + { + "epoch": 5.203471562843395, + "grad_norm": 0.20536153745260324, + "learning_rate": 7.18292544520718e-06, + "loss": 0.2014, + "num_tokens": 2909205202.0, + "step": 3814 + }, + { + "epoch": 5.204837143802076, + "grad_norm": 0.22094981709510114, + "learning_rate": 7.17842437251526e-06, + "loss": 0.2054, + "num_tokens": 2909930358.0, + "step": 3815 + }, + { + "epoch": 5.2062027247607565, + "grad_norm": 0.2261930284649632, + "learning_rate": 7.173924465620561e-06, + "loss": 0.2059, + "num_tokens": 2910651026.0, + "step": 3816 + }, + { + "epoch": 5.2075683057194375, + "grad_norm": 0.20799316690038727, + "learning_rate": 7.169425725895816e-06, + "loss": 0.2042, + "num_tokens": 2911457416.0, + "step": 3817 + }, + { + "epoch": 5.208933886678118, + "grad_norm": 0.23312664792829438, + "learning_rate": 7.164928154713395e-06, + "loss": 0.2065, + "num_tokens": 2912175709.0, + "step": 3818 + }, + { + "epoch": 5.210299467636798, + "grad_norm": 0.2191041810039405, + "learning_rate": 7.160431753445309e-06, + "loss": 0.217, + "num_tokens": 2912991295.0, + "step": 3819 + }, + { + "epoch": 5.211665048595479, + "grad_norm": 0.2257661907210269, + "learning_rate": 7.155936523463216e-06, + "loss": 0.1978, + "num_tokens": 2913724781.0, + "step": 3820 + }, + { + "epoch": 5.213030629554159, + "grad_norm": 0.2782668934721485, + "learning_rate": 7.151442466138418e-06, + "loss": 0.2017, + "num_tokens": 2914462084.0, + "step": 3821 + }, + { + "epoch": 5.21439621051284, + "grad_norm": 0.22721174553469378, + "learning_rate": 7.1469495828418624e-06, + "loss": 0.1937, + "num_tokens": 2915148584.0, + "step": 3822 + }, + { + "epoch": 5.21576179147152, + "grad_norm": 0.22763006882644274, + "learning_rate": 7.14245787494412e-06, + "loss": 0.2166, + "num_tokens": 2915892592.0, + "step": 3823 + }, + { + "epoch": 5.217127372430201, + "grad_norm": 0.2290755944527407, + "learning_rate": 7.137967343815431e-06, + "loss": 0.1972, + "num_tokens": 2916648004.0, + "step": 3824 + }, + { + "epoch": 5.218492953388881, + "grad_norm": 0.23173571289003456, + "learning_rate": 7.13347799082565e-06, + "loss": 0.1975, + "num_tokens": 2917369755.0, + "step": 3825 + }, + { + "epoch": 5.219858534347562, + "grad_norm": 0.2582734958833463, + "learning_rate": 7.12898981734429e-06, + "loss": 0.204, + "num_tokens": 2918139154.0, + "step": 3826 + }, + { + "epoch": 5.221224115306242, + "grad_norm": 0.202642484460165, + "learning_rate": 7.1245028247405e-06, + "loss": 0.2081, + "num_tokens": 2918883890.0, + "step": 3827 + }, + { + "epoch": 5.222589696264922, + "grad_norm": 0.2280625850878707, + "learning_rate": 7.120017014383063e-06, + "loss": 0.2012, + "num_tokens": 2919586829.0, + "step": 3828 + }, + { + "epoch": 5.223955277223603, + "grad_norm": 0.2378131251491234, + "learning_rate": 7.115532387640407e-06, + "loss": 0.2181, + "num_tokens": 2920327593.0, + "step": 3829 + }, + { + "epoch": 5.225320858182283, + "grad_norm": 0.2349497431338002, + "learning_rate": 7.111048945880597e-06, + "loss": 0.2127, + "num_tokens": 2921118664.0, + "step": 3830 + }, + { + "epoch": 5.226686439140964, + "grad_norm": 0.2068678242736546, + "learning_rate": 7.106566690471339e-06, + "loss": 0.2006, + "num_tokens": 2921833209.0, + "step": 3831 + }, + { + "epoch": 5.2280520200996445, + "grad_norm": 0.25917666778670595, + "learning_rate": 7.102085622779968e-06, + "loss": 0.2019, + "num_tokens": 2922624987.0, + "step": 3832 + }, + { + "epoch": 5.2294176010583255, + "grad_norm": 0.23533089684929226, + "learning_rate": 7.097605744173477e-06, + "loss": 0.2053, + "num_tokens": 2923419893.0, + "step": 3833 + }, + { + "epoch": 5.230783182017006, + "grad_norm": 0.2187524200542421, + "learning_rate": 7.093127056018472e-06, + "loss": 0.1993, + "num_tokens": 2924142121.0, + "step": 3834 + }, + { + "epoch": 5.232148762975687, + "grad_norm": 0.20805288633951538, + "learning_rate": 7.0886495596812085e-06, + "loss": 0.2094, + "num_tokens": 2924989374.0, + "step": 3835 + }, + { + "epoch": 5.233514343934367, + "grad_norm": 0.2075164908327263, + "learning_rate": 7.084173256527582e-06, + "loss": 0.2039, + "num_tokens": 2925790629.0, + "step": 3836 + }, + { + "epoch": 5.234879924893047, + "grad_norm": 0.21627954290217682, + "learning_rate": 7.079698147923111e-06, + "loss": 0.2142, + "num_tokens": 2926646421.0, + "step": 3837 + }, + { + "epoch": 5.236245505851728, + "grad_norm": 0.3231215570745274, + "learning_rate": 7.075224235232962e-06, + "loss": 0.2122, + "num_tokens": 2927405311.0, + "step": 3838 + }, + { + "epoch": 5.237611086810408, + "grad_norm": 0.22151428392172745, + "learning_rate": 7.070751519821932e-06, + "loss": 0.2049, + "num_tokens": 2928146186.0, + "step": 3839 + }, + { + "epoch": 5.238976667769089, + "grad_norm": 0.22128398675141975, + "learning_rate": 7.066280003054456e-06, + "loss": 0.2094, + "num_tokens": 2928999750.0, + "step": 3840 + }, + { + "epoch": 5.240342248727769, + "grad_norm": 0.20945109053100924, + "learning_rate": 7.061809686294585e-06, + "loss": 0.215, + "num_tokens": 2929786118.0, + "step": 3841 + }, + { + "epoch": 5.24170782968645, + "grad_norm": 0.21076573069011556, + "learning_rate": 7.0573405709060396e-06, + "loss": 0.1947, + "num_tokens": 2930540256.0, + "step": 3842 + }, + { + "epoch": 5.24307341064513, + "grad_norm": 0.21235474573283647, + "learning_rate": 7.052872658252141e-06, + "loss": 0.2023, + "num_tokens": 2931309753.0, + "step": 3843 + }, + { + "epoch": 5.244438991603811, + "grad_norm": 0.20724810815082806, + "learning_rate": 7.0484059496958575e-06, + "loss": 0.2062, + "num_tokens": 2932054402.0, + "step": 3844 + }, + { + "epoch": 5.245804572562491, + "grad_norm": 0.23829628577306675, + "learning_rate": 7.043940446599795e-06, + "loss": 0.209, + "num_tokens": 2932769018.0, + "step": 3845 + }, + { + "epoch": 5.247170153521171, + "grad_norm": 0.21781084827279307, + "learning_rate": 7.0394761503261765e-06, + "loss": 0.2017, + "num_tokens": 2933567115.0, + "step": 3846 + }, + { + "epoch": 5.248535734479852, + "grad_norm": 0.20943378442156013, + "learning_rate": 7.0350130622368666e-06, + "loss": 0.2026, + "num_tokens": 2934334057.0, + "step": 3847 + }, + { + "epoch": 5.249901315438533, + "grad_norm": 0.2143704922132957, + "learning_rate": 7.030551183693364e-06, + "loss": 0.2041, + "num_tokens": 2935128766.0, + "step": 3848 + }, + { + "epoch": 5.251266896397214, + "grad_norm": 0.2214658857179839, + "learning_rate": 7.026090516056798e-06, + "loss": 0.2032, + "num_tokens": 2935873209.0, + "step": 3849 + }, + { + "epoch": 5.252632477355894, + "grad_norm": 0.20422754477118435, + "learning_rate": 7.021631060687915e-06, + "loss": 0.1981, + "num_tokens": 2936628137.0, + "step": 3850 + }, + { + "epoch": 5.253998058314575, + "grad_norm": 0.21141180023045134, + "learning_rate": 7.017172818947105e-06, + "loss": 0.2016, + "num_tokens": 2937402047.0, + "step": 3851 + }, + { + "epoch": 5.255363639273255, + "grad_norm": 0.2053930901423065, + "learning_rate": 7.012715792194389e-06, + "loss": 0.2104, + "num_tokens": 2938163175.0, + "step": 3852 + }, + { + "epoch": 5.256729220231936, + "grad_norm": 0.23061720964938742, + "learning_rate": 7.008259981789409e-06, + "loss": 0.2025, + "num_tokens": 2938919984.0, + "step": 3853 + }, + { + "epoch": 5.258094801190616, + "grad_norm": 0.2130794877185142, + "learning_rate": 7.0038053890914445e-06, + "loss": 0.2016, + "num_tokens": 2939690312.0, + "step": 3854 + }, + { + "epoch": 5.259460382149296, + "grad_norm": 0.20842503646885813, + "learning_rate": 6.999352015459392e-06, + "loss": 0.1911, + "num_tokens": 2940500635.0, + "step": 3855 + }, + { + "epoch": 5.260825963107977, + "grad_norm": 0.206622847402675, + "learning_rate": 6.994899862251788e-06, + "loss": 0.2054, + "num_tokens": 2941266229.0, + "step": 3856 + }, + { + "epoch": 5.262191544066657, + "grad_norm": 0.20370694567723946, + "learning_rate": 6.9904489308267874e-06, + "loss": 0.2075, + "num_tokens": 2942046011.0, + "step": 3857 + }, + { + "epoch": 5.263557125025338, + "grad_norm": 0.21224502972670442, + "learning_rate": 6.98599922254218e-06, + "loss": 0.2032, + "num_tokens": 2942822616.0, + "step": 3858 + }, + { + "epoch": 5.264922705984018, + "grad_norm": 0.20986512126196755, + "learning_rate": 6.981550738755382e-06, + "loss": 0.2113, + "num_tokens": 2943726274.0, + "step": 3859 + }, + { + "epoch": 5.266288286942699, + "grad_norm": 0.22037342963510634, + "learning_rate": 6.9771034808234266e-06, + "loss": 0.2049, + "num_tokens": 2944519120.0, + "step": 3860 + }, + { + "epoch": 5.267653867901379, + "grad_norm": 0.2226433663988552, + "learning_rate": 6.9726574501029795e-06, + "loss": 0.2112, + "num_tokens": 2945287946.0, + "step": 3861 + }, + { + "epoch": 5.26901944886006, + "grad_norm": 0.21981291641226347, + "learning_rate": 6.968212647950337e-06, + "loss": 0.2031, + "num_tokens": 2946050946.0, + "step": 3862 + }, + { + "epoch": 5.2703850298187405, + "grad_norm": 0.22150686823442328, + "learning_rate": 6.963769075721418e-06, + "loss": 0.213, + "num_tokens": 2946838789.0, + "step": 3863 + }, + { + "epoch": 5.271750610777421, + "grad_norm": 0.2030750945912966, + "learning_rate": 6.959326734771754e-06, + "loss": 0.1976, + "num_tokens": 2947594649.0, + "step": 3864 + }, + { + "epoch": 5.273116191736102, + "grad_norm": 0.21730482994177372, + "learning_rate": 6.954885626456519e-06, + "loss": 0.2086, + "num_tokens": 2948385309.0, + "step": 3865 + }, + { + "epoch": 5.274481772694782, + "grad_norm": 0.21559735540238867, + "learning_rate": 6.950445752130501e-06, + "loss": 0.2037, + "num_tokens": 2949147582.0, + "step": 3866 + }, + { + "epoch": 5.275847353653463, + "grad_norm": 0.22463654666467894, + "learning_rate": 6.9460071131481096e-06, + "loss": 0.2, + "num_tokens": 2949926277.0, + "step": 3867 + }, + { + "epoch": 5.277212934612143, + "grad_norm": 0.2086699971850183, + "learning_rate": 6.941569710863389e-06, + "loss": 0.2133, + "num_tokens": 2950730013.0, + "step": 3868 + }, + { + "epoch": 5.278578515570824, + "grad_norm": 0.22295150774548883, + "learning_rate": 6.9371335466299905e-06, + "loss": 0.2082, + "num_tokens": 2951485273.0, + "step": 3869 + }, + { + "epoch": 5.279944096529504, + "grad_norm": 0.22122049514469388, + "learning_rate": 6.9326986218012e-06, + "loss": 0.2051, + "num_tokens": 2952191219.0, + "step": 3870 + }, + { + "epoch": 5.281309677488185, + "grad_norm": 0.2224592409572515, + "learning_rate": 6.928264937729919e-06, + "loss": 0.2174, + "num_tokens": 2952952636.0, + "step": 3871 + }, + { + "epoch": 5.282675258446865, + "grad_norm": 0.23104874395581698, + "learning_rate": 6.923832495768678e-06, + "loss": 0.2103, + "num_tokens": 2953693609.0, + "step": 3872 + }, + { + "epoch": 5.284040839405545, + "grad_norm": 0.221508196171485, + "learning_rate": 6.919401297269614e-06, + "loss": 0.2137, + "num_tokens": 2954524429.0, + "step": 3873 + }, + { + "epoch": 5.285406420364226, + "grad_norm": 0.2006615646744375, + "learning_rate": 6.9149713435845e-06, + "loss": 0.1958, + "num_tokens": 2955351525.0, + "step": 3874 + }, + { + "epoch": 5.286772001322906, + "grad_norm": 0.20664850070294355, + "learning_rate": 6.910542636064725e-06, + "loss": 0.2095, + "num_tokens": 2956084086.0, + "step": 3875 + }, + { + "epoch": 5.288137582281587, + "grad_norm": 0.22144603501606874, + "learning_rate": 6.9061151760612835e-06, + "loss": 0.2071, + "num_tokens": 2956835961.0, + "step": 3876 + }, + { + "epoch": 5.289503163240267, + "grad_norm": 0.21793817391935666, + "learning_rate": 6.90168896492482e-06, + "loss": 0.2019, + "num_tokens": 2957563220.0, + "step": 3877 + }, + { + "epoch": 5.290868744198948, + "grad_norm": 0.21957027969317294, + "learning_rate": 6.897264004005566e-06, + "loss": 0.2152, + "num_tokens": 2958364073.0, + "step": 3878 + }, + { + "epoch": 5.2922343251576285, + "grad_norm": 0.21423368784299876, + "learning_rate": 6.89284029465339e-06, + "loss": 0.2007, + "num_tokens": 2959060095.0, + "step": 3879 + }, + { + "epoch": 5.2935999061163095, + "grad_norm": 0.2164059030161996, + "learning_rate": 6.8884178382177735e-06, + "loss": 0.1991, + "num_tokens": 2959773835.0, + "step": 3880 + }, + { + "epoch": 5.29496548707499, + "grad_norm": 0.21749932340217, + "learning_rate": 6.8839966360478215e-06, + "loss": 0.2066, + "num_tokens": 2960500556.0, + "step": 3881 + }, + { + "epoch": 5.29633106803367, + "grad_norm": 0.2134249990529955, + "learning_rate": 6.879576689492244e-06, + "loss": 0.1984, + "num_tokens": 2961237474.0, + "step": 3882 + }, + { + "epoch": 5.297696648992351, + "grad_norm": 0.2119816066679417, + "learning_rate": 6.8751579998993775e-06, + "loss": 0.2117, + "num_tokens": 2962054518.0, + "step": 3883 + }, + { + "epoch": 5.299062229951031, + "grad_norm": 0.21123354634863953, + "learning_rate": 6.870740568617177e-06, + "loss": 0.2079, + "num_tokens": 2962898403.0, + "step": 3884 + }, + { + "epoch": 5.300427810909712, + "grad_norm": 0.22247465086742244, + "learning_rate": 6.8663243969931984e-06, + "loss": 0.2101, + "num_tokens": 2963694779.0, + "step": 3885 + }, + { + "epoch": 5.301793391868392, + "grad_norm": 0.20205372907774188, + "learning_rate": 6.8619094863746395e-06, + "loss": 0.2032, + "num_tokens": 2964479884.0, + "step": 3886 + }, + { + "epoch": 5.303158972827073, + "grad_norm": 0.21908163753243928, + "learning_rate": 6.857495838108288e-06, + "loss": 0.198, + "num_tokens": 2965260789.0, + "step": 3887 + }, + { + "epoch": 5.304524553785753, + "grad_norm": 0.2287376333976934, + "learning_rate": 6.853083453540557e-06, + "loss": 0.2089, + "num_tokens": 2966052441.0, + "step": 3888 + }, + { + "epoch": 5.305890134744434, + "grad_norm": 0.21353855195920252, + "learning_rate": 6.848672334017484e-06, + "loss": 0.2048, + "num_tokens": 2966838727.0, + "step": 3889 + }, + { + "epoch": 5.307255715703114, + "grad_norm": 0.21638175902077988, + "learning_rate": 6.844262480884698e-06, + "loss": 0.207, + "num_tokens": 2967651690.0, + "step": 3890 + }, + { + "epoch": 5.308621296661794, + "grad_norm": 0.20958988449824822, + "learning_rate": 6.839853895487459e-06, + "loss": 0.2091, + "num_tokens": 2968414422.0, + "step": 3891 + }, + { + "epoch": 5.309986877620475, + "grad_norm": 0.20986597733984208, + "learning_rate": 6.835446579170635e-06, + "loss": 0.2044, + "num_tokens": 2969132221.0, + "step": 3892 + }, + { + "epoch": 5.311352458579155, + "grad_norm": 0.23463757746358527, + "learning_rate": 6.831040533278712e-06, + "loss": 0.2006, + "num_tokens": 2969875938.0, + "step": 3893 + }, + { + "epoch": 5.312718039537836, + "grad_norm": 0.21107520089488901, + "learning_rate": 6.826635759155775e-06, + "loss": 0.2021, + "num_tokens": 2970621325.0, + "step": 3894 + }, + { + "epoch": 5.3140836204965165, + "grad_norm": 0.23263860847824042, + "learning_rate": 6.82223225814554e-06, + "loss": 0.2084, + "num_tokens": 2971298137.0, + "step": 3895 + }, + { + "epoch": 5.3154492014551975, + "grad_norm": 0.20749241828523024, + "learning_rate": 6.817830031591316e-06, + "loss": 0.2038, + "num_tokens": 2972063341.0, + "step": 3896 + }, + { + "epoch": 5.316814782413878, + "grad_norm": 0.229656857850978, + "learning_rate": 6.813429080836037e-06, + "loss": 0.2135, + "num_tokens": 2972838274.0, + "step": 3897 + }, + { + "epoch": 5.318180363372559, + "grad_norm": 0.2266837868976853, + "learning_rate": 6.809029407222242e-06, + "loss": 0.1999, + "num_tokens": 2973615333.0, + "step": 3898 + }, + { + "epoch": 5.319545944331239, + "grad_norm": 0.22065870763076495, + "learning_rate": 6.804631012092075e-06, + "loss": 0.2159, + "num_tokens": 2974414900.0, + "step": 3899 + }, + { + "epoch": 5.320911525289919, + "grad_norm": 0.19775343079142108, + "learning_rate": 6.8002338967873026e-06, + "loss": 0.211, + "num_tokens": 2975235244.0, + "step": 3900 + }, + { + "epoch": 5.3222771062486, + "grad_norm": 0.23786819160271633, + "learning_rate": 6.79583806264929e-06, + "loss": 0.1962, + "num_tokens": 2975928748.0, + "step": 3901 + }, + { + "epoch": 5.32364268720728, + "grad_norm": 0.21134203931635603, + "learning_rate": 6.7914435110190225e-06, + "loss": 0.2116, + "num_tokens": 2976715939.0, + "step": 3902 + }, + { + "epoch": 5.325008268165961, + "grad_norm": 0.20160089950672844, + "learning_rate": 6.787050243237076e-06, + "loss": 0.2047, + "num_tokens": 2977560098.0, + "step": 3903 + }, + { + "epoch": 5.326373849124641, + "grad_norm": 0.22908234980062678, + "learning_rate": 6.782658260643658e-06, + "loss": 0.2033, + "num_tokens": 2978261356.0, + "step": 3904 + }, + { + "epoch": 5.327739430083322, + "grad_norm": 0.23161987750464236, + "learning_rate": 6.778267564578569e-06, + "loss": 0.2126, + "num_tokens": 2979037412.0, + "step": 3905 + }, + { + "epoch": 5.329105011042002, + "grad_norm": 0.2305438749698866, + "learning_rate": 6.773878156381211e-06, + "loss": 0.2144, + "num_tokens": 2979814603.0, + "step": 3906 + }, + { + "epoch": 5.330470592000683, + "grad_norm": 0.2081562066004125, + "learning_rate": 6.7694900373906155e-06, + "loss": 0.2012, + "num_tokens": 2980564647.0, + "step": 3907 + }, + { + "epoch": 5.331836172959363, + "grad_norm": 0.22192673974876678, + "learning_rate": 6.7651032089453985e-06, + "loss": 0.2054, + "num_tokens": 2981310300.0, + "step": 3908 + }, + { + "epoch": 5.333201753918043, + "grad_norm": 0.2176542188437182, + "learning_rate": 6.760717672383795e-06, + "loss": 0.209, + "num_tokens": 2982033185.0, + "step": 3909 + }, + { + "epoch": 5.334567334876724, + "grad_norm": 0.22525557621780348, + "learning_rate": 6.7563334290436426e-06, + "loss": 0.2028, + "num_tokens": 2982848474.0, + "step": 3910 + }, + { + "epoch": 5.3359329158354045, + "grad_norm": 0.20635915928303075, + "learning_rate": 6.751950480262386e-06, + "loss": 0.2155, + "num_tokens": 2983653628.0, + "step": 3911 + }, + { + "epoch": 5.3372984967940855, + "grad_norm": 0.2154540681791433, + "learning_rate": 6.747568827377069e-06, + "loss": 0.2016, + "num_tokens": 2984416772.0, + "step": 3912 + }, + { + "epoch": 5.338664077752766, + "grad_norm": 0.22306836992650342, + "learning_rate": 6.743188471724343e-06, + "loss": 0.1951, + "num_tokens": 2985157701.0, + "step": 3913 + }, + { + "epoch": 5.340029658711447, + "grad_norm": 0.20611683063501834, + "learning_rate": 6.738809414640468e-06, + "loss": 0.2141, + "num_tokens": 2985898060.0, + "step": 3914 + }, + { + "epoch": 5.341395239670127, + "grad_norm": 0.2187750564732513, + "learning_rate": 6.734431657461307e-06, + "loss": 0.2027, + "num_tokens": 2986605338.0, + "step": 3915 + }, + { + "epoch": 5.342760820628808, + "grad_norm": 0.20961506531679483, + "learning_rate": 6.730055201522321e-06, + "loss": 0.1981, + "num_tokens": 2987388621.0, + "step": 3916 + }, + { + "epoch": 5.344126401587488, + "grad_norm": 0.20592194707910752, + "learning_rate": 6.725680048158576e-06, + "loss": 0.2087, + "num_tokens": 2988145617.0, + "step": 3917 + }, + { + "epoch": 5.345491982546168, + "grad_norm": 0.23028862312316617, + "learning_rate": 6.721306198704741e-06, + "loss": 0.2148, + "num_tokens": 2988849071.0, + "step": 3918 + }, + { + "epoch": 5.346857563504849, + "grad_norm": 0.21600451560393882, + "learning_rate": 6.716933654495093e-06, + "loss": 0.2118, + "num_tokens": 2989698036.0, + "step": 3919 + }, + { + "epoch": 5.348223144463529, + "grad_norm": 0.21066639382721805, + "learning_rate": 6.712562416863506e-06, + "loss": 0.2059, + "num_tokens": 2990416949.0, + "step": 3920 + }, + { + "epoch": 5.34958872542221, + "grad_norm": 0.22177766642982763, + "learning_rate": 6.7081924871434475e-06, + "loss": 0.1991, + "num_tokens": 2991073570.0, + "step": 3921 + }, + { + "epoch": 5.35095430638089, + "grad_norm": 0.22918639932125587, + "learning_rate": 6.703823866668001e-06, + "loss": 0.1996, + "num_tokens": 2991738334.0, + "step": 3922 + }, + { + "epoch": 5.352319887339571, + "grad_norm": 0.22809827484483122, + "learning_rate": 6.699456556769839e-06, + "loss": 0.2065, + "num_tokens": 2992531767.0, + "step": 3923 + }, + { + "epoch": 5.353685468298251, + "grad_norm": 0.22637273925794932, + "learning_rate": 6.695090558781242e-06, + "loss": 0.2029, + "num_tokens": 2993317909.0, + "step": 3924 + }, + { + "epoch": 5.355051049256932, + "grad_norm": 0.20931756894604284, + "learning_rate": 6.6907258740340895e-06, + "loss": 0.198, + "num_tokens": 2994073804.0, + "step": 3925 + }, + { + "epoch": 5.356416630215612, + "grad_norm": 0.21642887573944217, + "learning_rate": 6.68636250385985e-06, + "loss": 0.2052, + "num_tokens": 2994838006.0, + "step": 3926 + }, + { + "epoch": 5.3577822111742925, + "grad_norm": 0.20364415051136914, + "learning_rate": 6.682000449589603e-06, + "loss": 0.2116, + "num_tokens": 2995652277.0, + "step": 3927 + }, + { + "epoch": 5.3591477921329735, + "grad_norm": 0.21857818864510942, + "learning_rate": 6.677639712554027e-06, + "loss": 0.2099, + "num_tokens": 2996398720.0, + "step": 3928 + }, + { + "epoch": 5.360513373091654, + "grad_norm": 0.2314345631279765, + "learning_rate": 6.673280294083382e-06, + "loss": 0.2071, + "num_tokens": 2997131561.0, + "step": 3929 + }, + { + "epoch": 5.361878954050335, + "grad_norm": 0.20131179303858668, + "learning_rate": 6.668922195507548e-06, + "loss": 0.2072, + "num_tokens": 2997934342.0, + "step": 3930 + }, + { + "epoch": 5.363244535009015, + "grad_norm": 0.233263609568893, + "learning_rate": 6.664565418155989e-06, + "loss": 0.2091, + "num_tokens": 2998687850.0, + "step": 3931 + }, + { + "epoch": 5.364610115967696, + "grad_norm": 0.20367509243928286, + "learning_rate": 6.660209963357768e-06, + "loss": 0.2033, + "num_tokens": 2999417879.0, + "step": 3932 + }, + { + "epoch": 5.365975696926376, + "grad_norm": 0.22285521247094325, + "learning_rate": 6.655855832441549e-06, + "loss": 0.2075, + "num_tokens": 3000213010.0, + "step": 3933 + }, + { + "epoch": 5.367341277885057, + "grad_norm": 0.20587081710308405, + "learning_rate": 6.651503026735588e-06, + "loss": 0.2, + "num_tokens": 3000940295.0, + "step": 3934 + }, + { + "epoch": 5.368706858843737, + "grad_norm": 0.2268081529679454, + "learning_rate": 6.647151547567734e-06, + "loss": 0.2056, + "num_tokens": 3001682320.0, + "step": 3935 + }, + { + "epoch": 5.370072439802417, + "grad_norm": 0.21997545141581412, + "learning_rate": 6.6428013962654344e-06, + "loss": 0.2154, + "num_tokens": 3002515148.0, + "step": 3936 + }, + { + "epoch": 5.371438020761098, + "grad_norm": 0.22183626037955562, + "learning_rate": 6.638452574155741e-06, + "loss": 0.2116, + "num_tokens": 3003256720.0, + "step": 3937 + }, + { + "epoch": 5.372803601719778, + "grad_norm": 0.20952289743097108, + "learning_rate": 6.6341050825652795e-06, + "loss": 0.1984, + "num_tokens": 3003989555.0, + "step": 3938 + }, + { + "epoch": 5.374169182678459, + "grad_norm": 0.2572710191439589, + "learning_rate": 6.629758922820288e-06, + "loss": 0.2059, + "num_tokens": 3004778789.0, + "step": 3939 + }, + { + "epoch": 5.375534763637139, + "grad_norm": 0.2227413166976771, + "learning_rate": 6.6254140962465895e-06, + "loss": 0.2083, + "num_tokens": 3005603740.0, + "step": 3940 + }, + { + "epoch": 5.37690034459582, + "grad_norm": 0.2046889864532848, + "learning_rate": 6.621070604169602e-06, + "loss": 0.2095, + "num_tokens": 3006477235.0, + "step": 3941 + }, + { + "epoch": 5.3782659255545004, + "grad_norm": 0.2074449822734452, + "learning_rate": 6.616728447914338e-06, + "loss": 0.2151, + "num_tokens": 3007257369.0, + "step": 3942 + }, + { + "epoch": 5.3796315065131814, + "grad_norm": 0.2222527527049467, + "learning_rate": 6.6123876288054055e-06, + "loss": 0.2055, + "num_tokens": 3008091255.0, + "step": 3943 + }, + { + "epoch": 5.380997087471862, + "grad_norm": 0.2066662667538461, + "learning_rate": 6.608048148166992e-06, + "loss": 0.2095, + "num_tokens": 3008835966.0, + "step": 3944 + }, + { + "epoch": 5.382362668430542, + "grad_norm": 0.2094255872460499, + "learning_rate": 6.60371000732289e-06, + "loss": 0.2044, + "num_tokens": 3009581954.0, + "step": 3945 + }, + { + "epoch": 5.383728249389223, + "grad_norm": 0.24559765003364084, + "learning_rate": 6.59937320759648e-06, + "loss": 0.2014, + "num_tokens": 3010312747.0, + "step": 3946 + }, + { + "epoch": 5.385093830347903, + "grad_norm": 0.20678264104654098, + "learning_rate": 6.595037750310724e-06, + "loss": 0.2072, + "num_tokens": 3011082766.0, + "step": 3947 + }, + { + "epoch": 5.386459411306584, + "grad_norm": 0.2543945449072729, + "learning_rate": 6.590703636788194e-06, + "loss": 0.1934, + "num_tokens": 3011822170.0, + "step": 3948 + }, + { + "epoch": 5.387824992265264, + "grad_norm": 0.22275532818215518, + "learning_rate": 6.586370868351031e-06, + "loss": 0.2119, + "num_tokens": 3012652051.0, + "step": 3949 + }, + { + "epoch": 5.389190573223945, + "grad_norm": 0.19847591928556818, + "learning_rate": 6.5820394463209755e-06, + "loss": 0.2024, + "num_tokens": 3013437448.0, + "step": 3950 + }, + { + "epoch": 5.390556154182625, + "grad_norm": 0.24530013976803167, + "learning_rate": 6.577709372019365e-06, + "loss": 0.2008, + "num_tokens": 3014133575.0, + "step": 3951 + }, + { + "epoch": 5.391921735141306, + "grad_norm": 0.21422848722822987, + "learning_rate": 6.573380646767109e-06, + "loss": 0.2045, + "num_tokens": 3014806358.0, + "step": 3952 + }, + { + "epoch": 5.393287316099986, + "grad_norm": 0.22703608219277344, + "learning_rate": 6.5690532718847165e-06, + "loss": 0.216, + "num_tokens": 3015569583.0, + "step": 3953 + }, + { + "epoch": 5.394652897058666, + "grad_norm": 0.22086839885486104, + "learning_rate": 6.564727248692282e-06, + "loss": 0.2048, + "num_tokens": 3016278910.0, + "step": 3954 + }, + { + "epoch": 5.396018478017347, + "grad_norm": 0.22056733051736155, + "learning_rate": 6.560402578509493e-06, + "loss": 0.2113, + "num_tokens": 3017074705.0, + "step": 3955 + }, + { + "epoch": 5.397384058976027, + "grad_norm": 0.21840213135137446, + "learning_rate": 6.556079262655607e-06, + "loss": 0.2001, + "num_tokens": 3017833239.0, + "step": 3956 + }, + { + "epoch": 5.398749639934708, + "grad_norm": 0.21999095382393496, + "learning_rate": 6.551757302449495e-06, + "loss": 0.2166, + "num_tokens": 3018622790.0, + "step": 3957 + }, + { + "epoch": 5.4001152208933885, + "grad_norm": 0.19957739012206901, + "learning_rate": 6.54743669920959e-06, + "loss": 0.2024, + "num_tokens": 3019347669.0, + "step": 3958 + }, + { + "epoch": 5.4014808018520695, + "grad_norm": 0.22853674952185196, + "learning_rate": 6.5431174542539245e-06, + "loss": 0.2057, + "num_tokens": 3020126173.0, + "step": 3959 + }, + { + "epoch": 5.40284638281075, + "grad_norm": 0.21010204796162435, + "learning_rate": 6.538799568900116e-06, + "loss": 0.2098, + "num_tokens": 3020908127.0, + "step": 3960 + }, + { + "epoch": 5.404211963769431, + "grad_norm": 0.21473003703560234, + "learning_rate": 6.534483044465356e-06, + "loss": 0.2112, + "num_tokens": 3021648448.0, + "step": 3961 + }, + { + "epoch": 5.405577544728111, + "grad_norm": 0.23003744192902606, + "learning_rate": 6.530167882266435e-06, + "loss": 0.198, + "num_tokens": 3022383335.0, + "step": 3962 + }, + { + "epoch": 5.406943125686791, + "grad_norm": 0.21666741927684402, + "learning_rate": 6.525854083619721e-06, + "loss": 0.2122, + "num_tokens": 3023156801.0, + "step": 3963 + }, + { + "epoch": 5.408308706645472, + "grad_norm": 0.2171317227612684, + "learning_rate": 6.52154164984117e-06, + "loss": 0.2041, + "num_tokens": 3023959678.0, + "step": 3964 + }, + { + "epoch": 5.409674287604152, + "grad_norm": 0.2104430500593837, + "learning_rate": 6.5172305822463115e-06, + "loss": 0.2099, + "num_tokens": 3024689913.0, + "step": 3965 + }, + { + "epoch": 5.411039868562833, + "grad_norm": 0.2098890088110271, + "learning_rate": 6.512920882150274e-06, + "loss": 0.2069, + "num_tokens": 3025407564.0, + "step": 3966 + }, + { + "epoch": 5.412405449521513, + "grad_norm": 0.22973634723470993, + "learning_rate": 6.508612550867759e-06, + "loss": 0.2067, + "num_tokens": 3026177276.0, + "step": 3967 + }, + { + "epoch": 5.413771030480194, + "grad_norm": 0.20839545171748086, + "learning_rate": 6.504305589713041e-06, + "loss": 0.204, + "num_tokens": 3026923881.0, + "step": 3968 + }, + { + "epoch": 5.415136611438874, + "grad_norm": 0.22778132849813956, + "learning_rate": 6.500000000000003e-06, + "loss": 0.2087, + "num_tokens": 3027687394.0, + "step": 3969 + }, + { + "epoch": 5.416502192397555, + "grad_norm": 0.21037047184898067, + "learning_rate": 6.495695783042082e-06, + "loss": 0.2061, + "num_tokens": 3028389599.0, + "step": 3970 + }, + { + "epoch": 5.417867773356235, + "grad_norm": 0.2414489609470623, + "learning_rate": 6.491392940152315e-06, + "loss": 0.2095, + "num_tokens": 3029169920.0, + "step": 3971 + }, + { + "epoch": 5.419233354314915, + "grad_norm": 0.2077668625929062, + "learning_rate": 6.487091472643308e-06, + "loss": 0.2017, + "num_tokens": 3029941531.0, + "step": 3972 + }, + { + "epoch": 5.420598935273596, + "grad_norm": 0.22017754187498098, + "learning_rate": 6.482791381827261e-06, + "loss": 0.2062, + "num_tokens": 3030651189.0, + "step": 3973 + }, + { + "epoch": 5.4219645162322765, + "grad_norm": 0.2316252835736614, + "learning_rate": 6.478492669015932e-06, + "loss": 0.1981, + "num_tokens": 3031366648.0, + "step": 3974 + }, + { + "epoch": 5.4233300971909575, + "grad_norm": 0.20762883696792797, + "learning_rate": 6.474195335520687e-06, + "loss": 0.2027, + "num_tokens": 3032151233.0, + "step": 3975 + }, + { + "epoch": 5.424695678149638, + "grad_norm": 0.24004211229214478, + "learning_rate": 6.4698993826524485e-06, + "loss": 0.2082, + "num_tokens": 3032860335.0, + "step": 3976 + }, + { + "epoch": 5.426061259108319, + "grad_norm": 0.2136522610963918, + "learning_rate": 6.46560481172172e-06, + "loss": 0.2063, + "num_tokens": 3033667505.0, + "step": 3977 + }, + { + "epoch": 5.427426840066999, + "grad_norm": 0.20298596822369086, + "learning_rate": 6.461311624038602e-06, + "loss": 0.2105, + "num_tokens": 3034450014.0, + "step": 3978 + }, + { + "epoch": 5.42879242102568, + "grad_norm": 0.20205763260334447, + "learning_rate": 6.457019820912751e-06, + "loss": 0.2037, + "num_tokens": 3035308364.0, + "step": 3979 + }, + { + "epoch": 5.43015800198436, + "grad_norm": 0.2162672521254492, + "learning_rate": 6.4527294036534115e-06, + "loss": 0.2094, + "num_tokens": 3036121851.0, + "step": 3980 + }, + { + "epoch": 5.43152358294304, + "grad_norm": 0.2250839791990096, + "learning_rate": 6.448440373569407e-06, + "loss": 0.2052, + "num_tokens": 3036833305.0, + "step": 3981 + }, + { + "epoch": 5.432889163901721, + "grad_norm": 0.217943459385609, + "learning_rate": 6.444152731969135e-06, + "loss": 0.2062, + "num_tokens": 3037622713.0, + "step": 3982 + }, + { + "epoch": 5.434254744860401, + "grad_norm": 0.1951198917997279, + "learning_rate": 6.439866480160566e-06, + "loss": 0.206, + "num_tokens": 3038534317.0, + "step": 3983 + }, + { + "epoch": 5.435620325819082, + "grad_norm": 0.2097948606629585, + "learning_rate": 6.43558161945125e-06, + "loss": 0.2017, + "num_tokens": 3039338160.0, + "step": 3984 + }, + { + "epoch": 5.436985906777762, + "grad_norm": 0.21817028082996742, + "learning_rate": 6.4312981511483175e-06, + "loss": 0.2069, + "num_tokens": 3040113293.0, + "step": 3985 + }, + { + "epoch": 5.438351487736443, + "grad_norm": 0.22264698923364068, + "learning_rate": 6.427016076558459e-06, + "loss": 0.2148, + "num_tokens": 3040894044.0, + "step": 3986 + }, + { + "epoch": 5.439717068695123, + "grad_norm": 0.20188599497247336, + "learning_rate": 6.422735396987963e-06, + "loss": 0.2046, + "num_tokens": 3041723690.0, + "step": 3987 + }, + { + "epoch": 5.441082649653804, + "grad_norm": 0.19903786129190232, + "learning_rate": 6.418456113742668e-06, + "loss": 0.2082, + "num_tokens": 3042486863.0, + "step": 3988 + }, + { + "epoch": 5.442448230612484, + "grad_norm": 0.20804889391265294, + "learning_rate": 6.414178228128005e-06, + "loss": 0.1998, + "num_tokens": 3043273006.0, + "step": 3989 + }, + { + "epoch": 5.4438138115711645, + "grad_norm": 0.2160420412197853, + "learning_rate": 6.409901741448972e-06, + "loss": 0.2067, + "num_tokens": 3043955791.0, + "step": 3990 + }, + { + "epoch": 5.4451793925298455, + "grad_norm": 0.20425636187548976, + "learning_rate": 6.405626655010133e-06, + "loss": 0.2066, + "num_tokens": 3044834069.0, + "step": 3991 + }, + { + "epoch": 5.446544973488526, + "grad_norm": 0.21246159659244993, + "learning_rate": 6.401352970115638e-06, + "loss": 0.2104, + "num_tokens": 3045602629.0, + "step": 3992 + }, + { + "epoch": 5.447910554447207, + "grad_norm": 0.21886231902836617, + "learning_rate": 6.397080688069201e-06, + "loss": 0.2139, + "num_tokens": 3046360595.0, + "step": 3993 + }, + { + "epoch": 5.449276135405887, + "grad_norm": 0.20539537523016446, + "learning_rate": 6.392809810174114e-06, + "loss": 0.2075, + "num_tokens": 3047199492.0, + "step": 3994 + }, + { + "epoch": 5.450641716364568, + "grad_norm": 0.229334453274223, + "learning_rate": 6.3885403377332265e-06, + "loss": 0.1953, + "num_tokens": 3047803460.0, + "step": 3995 + }, + { + "epoch": 5.452007297323248, + "grad_norm": 0.22410905597612238, + "learning_rate": 6.384272272048983e-06, + "loss": 0.2075, + "num_tokens": 3048637447.0, + "step": 3996 + }, + { + "epoch": 5.453372878281929, + "grad_norm": 0.22274573284051796, + "learning_rate": 6.380005614423377e-06, + "loss": 0.2157, + "num_tokens": 3049428743.0, + "step": 3997 + }, + { + "epoch": 5.454738459240609, + "grad_norm": 0.21258309147166182, + "learning_rate": 6.375740366157981e-06, + "loss": 0.2069, + "num_tokens": 3050164842.0, + "step": 3998 + }, + { + "epoch": 5.456104040199289, + "grad_norm": 0.21520306475815004, + "learning_rate": 6.371476528553944e-06, + "loss": 0.1981, + "num_tokens": 3050907511.0, + "step": 3999 + }, + { + "epoch": 5.45746962115797, + "grad_norm": 0.20122939015207728, + "learning_rate": 6.367214102911972e-06, + "loss": 0.2069, + "num_tokens": 3051682836.0, + "step": 4000 + }, + { + "epoch": 5.45883520211665, + "grad_norm": 0.22592501514985766, + "learning_rate": 6.3629530905323464e-06, + "loss": 0.2116, + "num_tokens": 3052478045.0, + "step": 4001 + }, + { + "epoch": 5.460200783075331, + "grad_norm": 0.2176108478543041, + "learning_rate": 6.35869349271492e-06, + "loss": 0.2066, + "num_tokens": 3053210446.0, + "step": 4002 + }, + { + "epoch": 5.461566364034011, + "grad_norm": 0.20674467800831395, + "learning_rate": 6.354435310759111e-06, + "loss": 0.2073, + "num_tokens": 3054006308.0, + "step": 4003 + }, + { + "epoch": 5.462931944992692, + "grad_norm": 0.2345891668369277, + "learning_rate": 6.350178545963905e-06, + "loss": 0.2126, + "num_tokens": 3054741990.0, + "step": 4004 + }, + { + "epoch": 5.464297525951372, + "grad_norm": 0.23176850966060938, + "learning_rate": 6.345923199627864e-06, + "loss": 0.2199, + "num_tokens": 3055526044.0, + "step": 4005 + }, + { + "epoch": 5.465663106910053, + "grad_norm": 0.22589339573453995, + "learning_rate": 6.3416692730490996e-06, + "loss": 0.2077, + "num_tokens": 3056252404.0, + "step": 4006 + }, + { + "epoch": 5.4670286878687335, + "grad_norm": 0.23358481396508393, + "learning_rate": 6.337416767525304e-06, + "loss": 0.2117, + "num_tokens": 3056966395.0, + "step": 4007 + }, + { + "epoch": 5.468394268827414, + "grad_norm": 0.21025613402496268, + "learning_rate": 6.333165684353737e-06, + "loss": 0.2035, + "num_tokens": 3057664301.0, + "step": 4008 + }, + { + "epoch": 5.469759849786095, + "grad_norm": 0.251703696814109, + "learning_rate": 6.328916024831213e-06, + "loss": 0.2175, + "num_tokens": 3058438685.0, + "step": 4009 + }, + { + "epoch": 5.471125430744775, + "grad_norm": 0.2262444978695399, + "learning_rate": 6.324667790254122e-06, + "loss": 0.2051, + "num_tokens": 3059214596.0, + "step": 4010 + }, + { + "epoch": 5.472491011703456, + "grad_norm": 0.21644858962503064, + "learning_rate": 6.320420981918418e-06, + "loss": 0.2081, + "num_tokens": 3060016497.0, + "step": 4011 + }, + { + "epoch": 5.473856592662136, + "grad_norm": 0.22032054477746746, + "learning_rate": 6.316175601119616e-06, + "loss": 0.1995, + "num_tokens": 3060726131.0, + "step": 4012 + }, + { + "epoch": 5.475222173620817, + "grad_norm": 0.20388210939366752, + "learning_rate": 6.311931649152798e-06, + "loss": 0.2111, + "num_tokens": 3061575665.0, + "step": 4013 + }, + { + "epoch": 5.476587754579497, + "grad_norm": 0.22081055704488686, + "learning_rate": 6.307689127312618e-06, + "loss": 0.2073, + "num_tokens": 3062279694.0, + "step": 4014 + }, + { + "epoch": 5.477953335538178, + "grad_norm": 0.20898105737558534, + "learning_rate": 6.3034480368932715e-06, + "loss": 0.2126, + "num_tokens": 3063111464.0, + "step": 4015 + }, + { + "epoch": 5.479318916496858, + "grad_norm": 0.22929244385398376, + "learning_rate": 6.299208379188539e-06, + "loss": 0.2107, + "num_tokens": 3063836991.0, + "step": 4016 + }, + { + "epoch": 5.480684497455538, + "grad_norm": 0.2143771747382398, + "learning_rate": 6.2949701554917595e-06, + "loss": 0.2031, + "num_tokens": 3064568829.0, + "step": 4017 + }, + { + "epoch": 5.482050078414219, + "grad_norm": 0.20987728909390047, + "learning_rate": 6.290733367095826e-06, + "loss": 0.2094, + "num_tokens": 3065329798.0, + "step": 4018 + }, + { + "epoch": 5.483415659372899, + "grad_norm": 0.21117604972376425, + "learning_rate": 6.2864980152932e-06, + "loss": 0.2105, + "num_tokens": 3066050636.0, + "step": 4019 + }, + { + "epoch": 5.48478124033158, + "grad_norm": 0.2327515751279894, + "learning_rate": 6.282264101375906e-06, + "loss": 0.2071, + "num_tokens": 3066815131.0, + "step": 4020 + }, + { + "epoch": 5.48614682129026, + "grad_norm": 0.24443044086385796, + "learning_rate": 6.278031626635527e-06, + "loss": 0.2045, + "num_tokens": 3067613614.0, + "step": 4021 + }, + { + "epoch": 5.487512402248941, + "grad_norm": 0.22146707077219002, + "learning_rate": 6.273800592363211e-06, + "loss": 0.2161, + "num_tokens": 3068410579.0, + "step": 4022 + }, + { + "epoch": 5.4888779832076215, + "grad_norm": 0.19455245649918942, + "learning_rate": 6.269570999849655e-06, + "loss": 0.2127, + "num_tokens": 3069256654.0, + "step": 4023 + }, + { + "epoch": 5.4902435641663025, + "grad_norm": 0.21497086412925887, + "learning_rate": 6.265342850385131e-06, + "loss": 0.2065, + "num_tokens": 3070071738.0, + "step": 4024 + }, + { + "epoch": 5.491609145124983, + "grad_norm": 0.201725131659208, + "learning_rate": 6.261116145259462e-06, + "loss": 0.218, + "num_tokens": 3070909780.0, + "step": 4025 + }, + { + "epoch": 5.492974726083663, + "grad_norm": 0.219728704790792, + "learning_rate": 6.256890885762037e-06, + "loss": 0.2027, + "num_tokens": 3071648369.0, + "step": 4026 + }, + { + "epoch": 5.494340307042344, + "grad_norm": 0.2049371855384657, + "learning_rate": 6.2526670731817925e-06, + "loss": 0.2039, + "num_tokens": 3072395992.0, + "step": 4027 + }, + { + "epoch": 5.495705888001024, + "grad_norm": 0.20897281764403916, + "learning_rate": 6.248444708807235e-06, + "loss": 0.2042, + "num_tokens": 3073163013.0, + "step": 4028 + }, + { + "epoch": 5.497071468959705, + "grad_norm": 0.2085776967673941, + "learning_rate": 6.244223793926429e-06, + "loss": 0.2106, + "num_tokens": 3073916595.0, + "step": 4029 + }, + { + "epoch": 5.498437049918385, + "grad_norm": 0.22237470330227166, + "learning_rate": 6.240004329826981e-06, + "loss": 0.206, + "num_tokens": 3074579261.0, + "step": 4030 + }, + { + "epoch": 5.499802630877066, + "grad_norm": 0.22534106077116134, + "learning_rate": 6.235786317796079e-06, + "loss": 0.207, + "num_tokens": 3075404828.0, + "step": 4031 + }, + { + "epoch": 5.501168211835746, + "grad_norm": 0.20104296681205794, + "learning_rate": 6.231569759120448e-06, + "loss": 0.2027, + "num_tokens": 3076147419.0, + "step": 4032 + }, + { + "epoch": 5.502533792794427, + "grad_norm": 0.225964785451297, + "learning_rate": 6.2273546550863815e-06, + "loss": 0.2086, + "num_tokens": 3076899160.0, + "step": 4033 + }, + { + "epoch": 5.503899373753107, + "grad_norm": 0.21971665098145374, + "learning_rate": 6.2231410069797245e-06, + "loss": 0.2029, + "num_tokens": 3077621583.0, + "step": 4034 + }, + { + "epoch": 5.505264954711787, + "grad_norm": 0.2122077600393208, + "learning_rate": 6.218928816085881e-06, + "loss": 0.1985, + "num_tokens": 3078382603.0, + "step": 4035 + }, + { + "epoch": 5.506630535670468, + "grad_norm": 0.2183234093717378, + "learning_rate": 6.214718083689797e-06, + "loss": 0.2071, + "num_tokens": 3079120246.0, + "step": 4036 + }, + { + "epoch": 5.5079961166291485, + "grad_norm": 0.25820487067874803, + "learning_rate": 6.210508811076e-06, + "loss": 0.2082, + "num_tokens": 3079897172.0, + "step": 4037 + }, + { + "epoch": 5.5093616975878295, + "grad_norm": 0.21311041492054736, + "learning_rate": 6.2063009995285495e-06, + "loss": 0.2045, + "num_tokens": 3080690619.0, + "step": 4038 + }, + { + "epoch": 5.51072727854651, + "grad_norm": 0.2084524522622275, + "learning_rate": 6.2020946503310604e-06, + "loss": 0.2093, + "num_tokens": 3081505700.0, + "step": 4039 + }, + { + "epoch": 5.512092859505191, + "grad_norm": 0.21341027101383142, + "learning_rate": 6.197889764766722e-06, + "loss": 0.2102, + "num_tokens": 3082290722.0, + "step": 4040 + }, + { + "epoch": 5.513458440463871, + "grad_norm": 0.20357063469703732, + "learning_rate": 6.193686344118249e-06, + "loss": 0.2128, + "num_tokens": 3083052892.0, + "step": 4041 + }, + { + "epoch": 5.514824021422552, + "grad_norm": 0.21215134787095122, + "learning_rate": 6.189484389667929e-06, + "loss": 0.2017, + "num_tokens": 3083776991.0, + "step": 4042 + }, + { + "epoch": 5.516189602381232, + "grad_norm": 0.2175171423517633, + "learning_rate": 6.185283902697596e-06, + "loss": 0.1974, + "num_tokens": 3084468521.0, + "step": 4043 + }, + { + "epoch": 5.517555183339912, + "grad_norm": 0.21498635040138891, + "learning_rate": 6.181084884488642e-06, + "loss": 0.2012, + "num_tokens": 3085226689.0, + "step": 4044 + }, + { + "epoch": 5.518920764298593, + "grad_norm": 0.2046053770235797, + "learning_rate": 6.1768873363219935e-06, + "loss": 0.2085, + "num_tokens": 3086000490.0, + "step": 4045 + }, + { + "epoch": 5.520286345257273, + "grad_norm": 0.20763218667975852, + "learning_rate": 6.172691259478151e-06, + "loss": 0.2063, + "num_tokens": 3086797562.0, + "step": 4046 + }, + { + "epoch": 5.521651926215954, + "grad_norm": 0.2204174048373065, + "learning_rate": 6.168496655237153e-06, + "loss": 0.2066, + "num_tokens": 3087569172.0, + "step": 4047 + }, + { + "epoch": 5.523017507174634, + "grad_norm": 0.22262146601577268, + "learning_rate": 6.164303524878585e-06, + "loss": 0.2049, + "num_tokens": 3088338864.0, + "step": 4048 + }, + { + "epoch": 5.524383088133315, + "grad_norm": 0.20773856372480864, + "learning_rate": 6.160111869681602e-06, + "loss": 0.1982, + "num_tokens": 3089019602.0, + "step": 4049 + }, + { + "epoch": 5.525748669091995, + "grad_norm": 0.21064764970965, + "learning_rate": 6.155921690924887e-06, + "loss": 0.2048, + "num_tokens": 3089852351.0, + "step": 4050 + }, + { + "epoch": 5.527114250050676, + "grad_norm": 0.21593256174257552, + "learning_rate": 6.151732989886685e-06, + "loss": 0.2072, + "num_tokens": 3090563410.0, + "step": 4051 + }, + { + "epoch": 5.528479831009356, + "grad_norm": 0.21493090892184094, + "learning_rate": 6.1475457678447855e-06, + "loss": 0.2143, + "num_tokens": 3091345855.0, + "step": 4052 + }, + { + "epoch": 5.5298454119680365, + "grad_norm": 0.22296841197131737, + "learning_rate": 6.1433600260765345e-06, + "loss": 0.2115, + "num_tokens": 3092141335.0, + "step": 4053 + }, + { + "epoch": 5.5312109929267175, + "grad_norm": 0.2190920650727816, + "learning_rate": 6.139175765858814e-06, + "loss": 0.2115, + "num_tokens": 3092951165.0, + "step": 4054 + }, + { + "epoch": 5.532576573885398, + "grad_norm": 0.21489894620353736, + "learning_rate": 6.134992988468063e-06, + "loss": 0.2175, + "num_tokens": 3093795756.0, + "step": 4055 + }, + { + "epoch": 5.533942154844079, + "grad_norm": 0.22743818771160382, + "learning_rate": 6.130811695180268e-06, + "loss": 0.2075, + "num_tokens": 3094598915.0, + "step": 4056 + }, + { + "epoch": 5.535307735802759, + "grad_norm": 0.2188088889598014, + "learning_rate": 6.126631887270951e-06, + "loss": 0.2038, + "num_tokens": 3095311056.0, + "step": 4057 + }, + { + "epoch": 5.53667331676144, + "grad_norm": 0.20591357683576897, + "learning_rate": 6.122453566015205e-06, + "loss": 0.2088, + "num_tokens": 3096137685.0, + "step": 4058 + }, + { + "epoch": 5.53803889772012, + "grad_norm": 0.20199409820903227, + "learning_rate": 6.1182767326876444e-06, + "loss": 0.2073, + "num_tokens": 3096925322.0, + "step": 4059 + }, + { + "epoch": 5.539404478678801, + "grad_norm": 0.2270911282985042, + "learning_rate": 6.1141013885624434e-06, + "loss": 0.211, + "num_tokens": 3097668110.0, + "step": 4060 + }, + { + "epoch": 5.540770059637481, + "grad_norm": 0.21858224468166615, + "learning_rate": 6.10992753491332e-06, + "loss": 0.2154, + "num_tokens": 3098403403.0, + "step": 4061 + }, + { + "epoch": 5.542135640596161, + "grad_norm": 0.2123622401770957, + "learning_rate": 6.105755173013533e-06, + "loss": 0.2004, + "num_tokens": 3099170392.0, + "step": 4062 + }, + { + "epoch": 5.543501221554842, + "grad_norm": 0.21016835185898516, + "learning_rate": 6.101584304135891e-06, + "loss": 0.2101, + "num_tokens": 3099942311.0, + "step": 4063 + }, + { + "epoch": 5.544866802513522, + "grad_norm": 0.20522968483676618, + "learning_rate": 6.0974149295527454e-06, + "loss": 0.2014, + "num_tokens": 3100812410.0, + "step": 4064 + }, + { + "epoch": 5.546232383472203, + "grad_norm": 0.1999648359479289, + "learning_rate": 6.093247050535995e-06, + "loss": 0.2116, + "num_tokens": 3101666417.0, + "step": 4065 + }, + { + "epoch": 5.547597964430883, + "grad_norm": 0.2040894145205277, + "learning_rate": 6.089080668357069e-06, + "loss": 0.2128, + "num_tokens": 3102413714.0, + "step": 4066 + }, + { + "epoch": 5.548963545389564, + "grad_norm": 0.22416274919145654, + "learning_rate": 6.084915784286964e-06, + "loss": 0.2013, + "num_tokens": 3103176374.0, + "step": 4067 + }, + { + "epoch": 5.550329126348244, + "grad_norm": 0.1951795178098006, + "learning_rate": 6.080752399596197e-06, + "loss": 0.1915, + "num_tokens": 3103846975.0, + "step": 4068 + }, + { + "epoch": 5.551694707306925, + "grad_norm": 0.23006215214645082, + "learning_rate": 6.076590515554836e-06, + "loss": 0.2142, + "num_tokens": 3104597477.0, + "step": 4069 + }, + { + "epoch": 5.5530602882656055, + "grad_norm": 0.22122603193311416, + "learning_rate": 6.072430133432499e-06, + "loss": 0.2113, + "num_tokens": 3105344862.0, + "step": 4070 + }, + { + "epoch": 5.554425869224286, + "grad_norm": 0.2196496386678765, + "learning_rate": 6.068271254498329e-06, + "loss": 0.2005, + "num_tokens": 3106093499.0, + "step": 4071 + }, + { + "epoch": 5.555791450182967, + "grad_norm": 0.21178106363685606, + "learning_rate": 6.0641138800210255e-06, + "loss": 0.2173, + "num_tokens": 3106863822.0, + "step": 4072 + }, + { + "epoch": 5.557157031141647, + "grad_norm": 0.2057309049225531, + "learning_rate": 6.059958011268822e-06, + "loss": 0.2068, + "num_tokens": 3107622345.0, + "step": 4073 + }, + { + "epoch": 5.558522612100328, + "grad_norm": 0.21712802466768952, + "learning_rate": 6.055803649509498e-06, + "loss": 0.2063, + "num_tokens": 3108354943.0, + "step": 4074 + }, + { + "epoch": 5.559888193059008, + "grad_norm": 0.20870504984157687, + "learning_rate": 6.0516507960103565e-06, + "loss": 0.2106, + "num_tokens": 3109129219.0, + "step": 4075 + }, + { + "epoch": 5.561253774017689, + "grad_norm": 0.317943053362746, + "learning_rate": 6.04749945203827e-06, + "loss": 0.207, + "num_tokens": 3109889531.0, + "step": 4076 + }, + { + "epoch": 5.562619354976369, + "grad_norm": 0.22457200584632273, + "learning_rate": 6.0433496188596245e-06, + "loss": 0.2098, + "num_tokens": 3110625203.0, + "step": 4077 + }, + { + "epoch": 5.56398493593505, + "grad_norm": 0.20949840851837406, + "learning_rate": 6.039201297740355e-06, + "loss": 0.21, + "num_tokens": 3111468690.0, + "step": 4078 + }, + { + "epoch": 5.56535051689373, + "grad_norm": 0.2134721578662111, + "learning_rate": 6.035054489945939e-06, + "loss": 0.203, + "num_tokens": 3112182456.0, + "step": 4079 + }, + { + "epoch": 5.56671609785241, + "grad_norm": 0.2143516903383674, + "learning_rate": 6.030909196741382e-06, + "loss": 0.2097, + "num_tokens": 3112976043.0, + "step": 4080 + }, + { + "epoch": 5.568081678811091, + "grad_norm": 0.2128802746430646, + "learning_rate": 6.026765419391234e-06, + "loss": 0.2033, + "num_tokens": 3113768274.0, + "step": 4081 + }, + { + "epoch": 5.569447259769771, + "grad_norm": 0.2079684664105549, + "learning_rate": 6.022623159159584e-06, + "loss": 0.2006, + "num_tokens": 3114557570.0, + "step": 4082 + }, + { + "epoch": 5.570812840728452, + "grad_norm": 0.18879824208793158, + "learning_rate": 6.018482417310062e-06, + "loss": 0.2037, + "num_tokens": 3115328265.0, + "step": 4083 + }, + { + "epoch": 5.572178421687132, + "grad_norm": 0.22009167294925178, + "learning_rate": 6.014343195105818e-06, + "loss": 0.2072, + "num_tokens": 3116078488.0, + "step": 4084 + }, + { + "epoch": 5.573544002645813, + "grad_norm": 0.20024304589917835, + "learning_rate": 6.010205493809555e-06, + "loss": 0.2077, + "num_tokens": 3116872104.0, + "step": 4085 + }, + { + "epoch": 5.5749095836044935, + "grad_norm": 0.22655369007522966, + "learning_rate": 6.006069314683507e-06, + "loss": 0.2132, + "num_tokens": 3117603523.0, + "step": 4086 + }, + { + "epoch": 5.5762751645631745, + "grad_norm": 0.21564135808790766, + "learning_rate": 6.001934658989442e-06, + "loss": 0.2124, + "num_tokens": 3118463469.0, + "step": 4087 + }, + { + "epoch": 5.577640745521855, + "grad_norm": 0.20648270669668944, + "learning_rate": 5.997801527988669e-06, + "loss": 0.1977, + "num_tokens": 3119223708.0, + "step": 4088 + }, + { + "epoch": 5.579006326480535, + "grad_norm": 0.2106464330369209, + "learning_rate": 5.993669922942018e-06, + "loss": 0.2009, + "num_tokens": 3119923912.0, + "step": 4089 + }, + { + "epoch": 5.580371907439216, + "grad_norm": 0.21273269270820844, + "learning_rate": 5.989539845109868e-06, + "loss": 0.1992, + "num_tokens": 3120739563.0, + "step": 4090 + }, + { + "epoch": 5.581737488397896, + "grad_norm": 0.21097021160117405, + "learning_rate": 5.985411295752128e-06, + "loss": 0.2031, + "num_tokens": 3121473005.0, + "step": 4091 + }, + { + "epoch": 5.583103069356577, + "grad_norm": 0.23030349271592465, + "learning_rate": 5.981284276128238e-06, + "loss": 0.2032, + "num_tokens": 3122266349.0, + "step": 4092 + }, + { + "epoch": 5.584468650315257, + "grad_norm": 0.2021921358725317, + "learning_rate": 5.977158787497175e-06, + "loss": 0.198, + "num_tokens": 3123071981.0, + "step": 4093 + }, + { + "epoch": 5.585834231273938, + "grad_norm": 0.20716562478186848, + "learning_rate": 5.973034831117444e-06, + "loss": 0.2029, + "num_tokens": 3123918070.0, + "step": 4094 + }, + { + "epoch": 5.587199812232618, + "grad_norm": 0.21887392955435184, + "learning_rate": 5.968912408247085e-06, + "loss": 0.2085, + "num_tokens": 3124719503.0, + "step": 4095 + }, + { + "epoch": 5.588565393191299, + "grad_norm": 0.21923978832350263, + "learning_rate": 5.9647915201436735e-06, + "loss": 0.2121, + "num_tokens": 3125485855.0, + "step": 4096 + }, + { + "epoch": 5.589930974149979, + "grad_norm": 0.19992748321655135, + "learning_rate": 5.960672168064316e-06, + "loss": 0.2002, + "num_tokens": 3126223479.0, + "step": 4097 + }, + { + "epoch": 5.591296555108659, + "grad_norm": 0.20946537043658145, + "learning_rate": 5.956554353265642e-06, + "loss": 0.2081, + "num_tokens": 3126977866.0, + "step": 4098 + }, + { + "epoch": 5.59266213606734, + "grad_norm": 0.21736264417558654, + "learning_rate": 5.952438077003822e-06, + "loss": 0.2135, + "num_tokens": 3127721005.0, + "step": 4099 + }, + { + "epoch": 5.59402771702602, + "grad_norm": 0.22115569313970443, + "learning_rate": 5.948323340534554e-06, + "loss": 0.2088, + "num_tokens": 3128438761.0, + "step": 4100 + }, + { + "epoch": 5.595393297984701, + "grad_norm": 0.22903359722893876, + "learning_rate": 5.944210145113063e-06, + "loss": 0.2013, + "num_tokens": 3129258469.0, + "step": 4101 + }, + { + "epoch": 5.5967588789433815, + "grad_norm": 0.20861779509467682, + "learning_rate": 5.940098491994113e-06, + "loss": 0.2072, + "num_tokens": 3130016861.0, + "step": 4102 + }, + { + "epoch": 5.5981244599020625, + "grad_norm": 0.23941779732263477, + "learning_rate": 5.935988382431984e-06, + "loss": 0.2091, + "num_tokens": 3130761304.0, + "step": 4103 + }, + { + "epoch": 5.599490040860743, + "grad_norm": 0.19040778178950618, + "learning_rate": 5.931879817680497e-06, + "loss": 0.2172, + "num_tokens": 3131668800.0, + "step": 4104 + }, + { + "epoch": 5.600855621819424, + "grad_norm": 0.2114857516777645, + "learning_rate": 5.9277727989929945e-06, + "loss": 0.1997, + "num_tokens": 3132386950.0, + "step": 4105 + }, + { + "epoch": 5.602221202778104, + "grad_norm": 0.2058526650825957, + "learning_rate": 5.923667327622355e-06, + "loss": 0.1993, + "num_tokens": 3133173587.0, + "step": 4106 + }, + { + "epoch": 5.603586783736784, + "grad_norm": 0.21651482145756312, + "learning_rate": 5.919563404820976e-06, + "loss": 0.2094, + "num_tokens": 3133967098.0, + "step": 4107 + }, + { + "epoch": 5.604952364695465, + "grad_norm": 0.270240867258495, + "learning_rate": 5.915461031840784e-06, + "loss": 0.2092, + "num_tokens": 3134726030.0, + "step": 4108 + }, + { + "epoch": 5.606317945654145, + "grad_norm": 0.21973160496800873, + "learning_rate": 5.911360209933244e-06, + "loss": 0.1999, + "num_tokens": 3135477071.0, + "step": 4109 + }, + { + "epoch": 5.607683526612826, + "grad_norm": 0.2326094519295077, + "learning_rate": 5.907260940349324e-06, + "loss": 0.2113, + "num_tokens": 3136310599.0, + "step": 4110 + }, + { + "epoch": 5.609049107571506, + "grad_norm": 0.20494262349505174, + "learning_rate": 5.903163224339553e-06, + "loss": 0.2042, + "num_tokens": 3137059520.0, + "step": 4111 + }, + { + "epoch": 5.610414688530187, + "grad_norm": 0.21660359980031887, + "learning_rate": 5.899067063153953e-06, + "loss": 0.208, + "num_tokens": 3137818275.0, + "step": 4112 + }, + { + "epoch": 5.611780269488867, + "grad_norm": 0.20562294403696807, + "learning_rate": 5.894972458042088e-06, + "loss": 0.2075, + "num_tokens": 3138555024.0, + "step": 4113 + }, + { + "epoch": 5.613145850447548, + "grad_norm": 0.19581591305992788, + "learning_rate": 5.890879410253049e-06, + "loss": 0.2094, + "num_tokens": 3139425746.0, + "step": 4114 + }, + { + "epoch": 5.614511431406228, + "grad_norm": 0.2110429235340085, + "learning_rate": 5.8867879210354465e-06, + "loss": 0.2067, + "num_tokens": 3140206696.0, + "step": 4115 + }, + { + "epoch": 5.615877012364908, + "grad_norm": 0.2192327126848145, + "learning_rate": 5.88269799163741e-06, + "loss": 0.2059, + "num_tokens": 3140937706.0, + "step": 4116 + }, + { + "epoch": 5.617242593323589, + "grad_norm": 0.21976734388039076, + "learning_rate": 5.878609623306606e-06, + "loss": 0.2001, + "num_tokens": 3141740337.0, + "step": 4117 + }, + { + "epoch": 5.6186081742822696, + "grad_norm": 0.20077167780228503, + "learning_rate": 5.87452281729022e-06, + "loss": 0.2055, + "num_tokens": 3142455034.0, + "step": 4118 + }, + { + "epoch": 5.6199737552409506, + "grad_norm": 0.2253784301744551, + "learning_rate": 5.870437574834953e-06, + "loss": 0.2114, + "num_tokens": 3143258328.0, + "step": 4119 + }, + { + "epoch": 5.621339336199631, + "grad_norm": 0.2226388452353918, + "learning_rate": 5.866353897187043e-06, + "loss": 0.2101, + "num_tokens": 3143988079.0, + "step": 4120 + }, + { + "epoch": 5.622704917158312, + "grad_norm": 0.22258174195057312, + "learning_rate": 5.862271785592238e-06, + "loss": 0.2112, + "num_tokens": 3144745912.0, + "step": 4121 + }, + { + "epoch": 5.624070498116992, + "grad_norm": 0.22498194746596176, + "learning_rate": 5.858191241295814e-06, + "loss": 0.2099, + "num_tokens": 3145570708.0, + "step": 4122 + }, + { + "epoch": 5.625436079075673, + "grad_norm": 0.1908574736654714, + "learning_rate": 5.854112265542575e-06, + "loss": 0.21, + "num_tokens": 3146417194.0, + "step": 4123 + }, + { + "epoch": 5.626801660034353, + "grad_norm": 0.20034749822455747, + "learning_rate": 5.850034859576827e-06, + "loss": 0.202, + "num_tokens": 3147240342.0, + "step": 4124 + }, + { + "epoch": 5.628167240993033, + "grad_norm": 0.21920728416392513, + "learning_rate": 5.84595902464242e-06, + "loss": 0.2155, + "num_tokens": 3148007324.0, + "step": 4125 + }, + { + "epoch": 5.629532821951714, + "grad_norm": 0.21035478817512437, + "learning_rate": 5.841884761982712e-06, + "loss": 0.2152, + "num_tokens": 3148783698.0, + "step": 4126 + }, + { + "epoch": 5.630898402910394, + "grad_norm": 0.21925088626475825, + "learning_rate": 5.837812072840589e-06, + "loss": 0.2115, + "num_tokens": 3149564510.0, + "step": 4127 + }, + { + "epoch": 5.632263983869075, + "grad_norm": 0.23236973787996654, + "learning_rate": 5.833740958458438e-06, + "loss": 0.2161, + "num_tokens": 3150325159.0, + "step": 4128 + }, + { + "epoch": 5.633629564827755, + "grad_norm": 0.21280160402838733, + "learning_rate": 5.829671420078197e-06, + "loss": 0.2118, + "num_tokens": 3151096884.0, + "step": 4129 + }, + { + "epoch": 5.634995145786436, + "grad_norm": 0.22114138329853644, + "learning_rate": 5.8256034589412935e-06, + "loss": 0.2025, + "num_tokens": 3151809893.0, + "step": 4130 + }, + { + "epoch": 5.636360726745116, + "grad_norm": 0.22964948640905392, + "learning_rate": 5.821537076288691e-06, + "loss": 0.1935, + "num_tokens": 3152617124.0, + "step": 4131 + }, + { + "epoch": 5.637726307703797, + "grad_norm": 0.20902028762433972, + "learning_rate": 5.817472273360869e-06, + "loss": 0.2088, + "num_tokens": 3153397838.0, + "step": 4132 + }, + { + "epoch": 5.6390918886624775, + "grad_norm": 0.1970712489301563, + "learning_rate": 5.813409051397818e-06, + "loss": 0.2047, + "num_tokens": 3154158712.0, + "step": 4133 + }, + { + "epoch": 5.640457469621158, + "grad_norm": 0.22073528222486, + "learning_rate": 5.809347411639052e-06, + "loss": 0.2129, + "num_tokens": 3154899646.0, + "step": 4134 + }, + { + "epoch": 5.641823050579839, + "grad_norm": 0.22631562477108055, + "learning_rate": 5.805287355323606e-06, + "loss": 0.2036, + "num_tokens": 3155668206.0, + "step": 4135 + }, + { + "epoch": 5.643188631538519, + "grad_norm": 0.22287095708652782, + "learning_rate": 5.801228883690025e-06, + "loss": 0.2036, + "num_tokens": 3156406095.0, + "step": 4136 + }, + { + "epoch": 5.6445542124972, + "grad_norm": 0.21522923227665347, + "learning_rate": 5.797171997976364e-06, + "loss": 0.21, + "num_tokens": 3157169685.0, + "step": 4137 + }, + { + "epoch": 5.64591979345588, + "grad_norm": 0.21668991876207291, + "learning_rate": 5.793116699420218e-06, + "loss": 0.2061, + "num_tokens": 3157876834.0, + "step": 4138 + }, + { + "epoch": 5.647285374414561, + "grad_norm": 0.22472686343258116, + "learning_rate": 5.789062989258677e-06, + "loss": 0.2107, + "num_tokens": 3158662827.0, + "step": 4139 + }, + { + "epoch": 5.648650955373241, + "grad_norm": 0.20152314247855102, + "learning_rate": 5.785010868728345e-06, + "loss": 0.2068, + "num_tokens": 3159472617.0, + "step": 4140 + }, + { + "epoch": 5.650016536331922, + "grad_norm": 0.21363527709941427, + "learning_rate": 5.780960339065361e-06, + "loss": 0.2087, + "num_tokens": 3160301663.0, + "step": 4141 + }, + { + "epoch": 5.651382117290602, + "grad_norm": 0.1980537890775585, + "learning_rate": 5.7769114015053575e-06, + "loss": 0.2113, + "num_tokens": 3161069948.0, + "step": 4142 + }, + { + "epoch": 5.652747698249282, + "grad_norm": 0.2290609837328324, + "learning_rate": 5.772864057283492e-06, + "loss": 0.2113, + "num_tokens": 3161789416.0, + "step": 4143 + }, + { + "epoch": 5.654113279207963, + "grad_norm": 0.21234301297175123, + "learning_rate": 5.768818307634436e-06, + "loss": 0.2035, + "num_tokens": 3162567169.0, + "step": 4144 + }, + { + "epoch": 5.655478860166643, + "grad_norm": 0.22367918593385025, + "learning_rate": 5.764774153792374e-06, + "loss": 0.2074, + "num_tokens": 3163366386.0, + "step": 4145 + }, + { + "epoch": 5.656844441125324, + "grad_norm": 0.22538655218840165, + "learning_rate": 5.760731596990992e-06, + "loss": 0.2126, + "num_tokens": 3164158679.0, + "step": 4146 + }, + { + "epoch": 5.658210022084004, + "grad_norm": 0.21872261798512013, + "learning_rate": 5.756690638463512e-06, + "loss": 0.2052, + "num_tokens": 3164915297.0, + "step": 4147 + }, + { + "epoch": 5.659575603042685, + "grad_norm": 0.2201534585509166, + "learning_rate": 5.752651279442647e-06, + "loss": 0.2059, + "num_tokens": 3165729861.0, + "step": 4148 + }, + { + "epoch": 5.6609411840013655, + "grad_norm": 0.21724150892351443, + "learning_rate": 5.7486135211606344e-06, + "loss": 0.204, + "num_tokens": 3166450051.0, + "step": 4149 + }, + { + "epoch": 5.6623067649600465, + "grad_norm": 0.22277488222355468, + "learning_rate": 5.744577364849222e-06, + "loss": 0.2087, + "num_tokens": 3167189343.0, + "step": 4150 + }, + { + "epoch": 5.663672345918727, + "grad_norm": 0.20412197006633614, + "learning_rate": 5.740542811739658e-06, + "loss": 0.2009, + "num_tokens": 3168016654.0, + "step": 4151 + }, + { + "epoch": 5.665037926877407, + "grad_norm": 0.2114148128753542, + "learning_rate": 5.736509863062713e-06, + "loss": 0.1996, + "num_tokens": 3168769333.0, + "step": 4152 + }, + { + "epoch": 5.666403507836088, + "grad_norm": 0.20578731270148976, + "learning_rate": 5.7324785200486686e-06, + "loss": 0.2078, + "num_tokens": 3169567851.0, + "step": 4153 + }, + { + "epoch": 5.667769088794768, + "grad_norm": 0.2242712683466558, + "learning_rate": 5.7284487839273136e-06, + "loss": 0.2085, + "num_tokens": 3170285210.0, + "step": 4154 + }, + { + "epoch": 5.669134669753449, + "grad_norm": 0.21246609675287478, + "learning_rate": 5.7244206559279405e-06, + "loss": 0.2094, + "num_tokens": 3171021253.0, + "step": 4155 + }, + { + "epoch": 5.670500250712129, + "grad_norm": 0.2222846227687881, + "learning_rate": 5.72039413727936e-06, + "loss": 0.2124, + "num_tokens": 3171781660.0, + "step": 4156 + }, + { + "epoch": 5.67186583167081, + "grad_norm": 0.22235257434960415, + "learning_rate": 5.716369229209885e-06, + "loss": 0.2081, + "num_tokens": 3172620979.0, + "step": 4157 + }, + { + "epoch": 5.67323141262949, + "grad_norm": 0.2166059863737499, + "learning_rate": 5.712345932947345e-06, + "loss": 0.2033, + "num_tokens": 3173336689.0, + "step": 4158 + }, + { + "epoch": 5.674596993588171, + "grad_norm": 0.21595549040077594, + "learning_rate": 5.708324249719077e-06, + "loss": 0.2076, + "num_tokens": 3174085565.0, + "step": 4159 + }, + { + "epoch": 5.675962574546851, + "grad_norm": 0.22280020610484227, + "learning_rate": 5.704304180751912e-06, + "loss": 0.2108, + "num_tokens": 3174782401.0, + "step": 4160 + }, + { + "epoch": 5.677328155505531, + "grad_norm": 0.22113022810331445, + "learning_rate": 5.700285727272206e-06, + "loss": 0.2091, + "num_tokens": 3175571919.0, + "step": 4161 + }, + { + "epoch": 5.678693736464212, + "grad_norm": 0.20066665226316344, + "learning_rate": 5.696268890505814e-06, + "loss": 0.2046, + "num_tokens": 3176314937.0, + "step": 4162 + }, + { + "epoch": 5.680059317422892, + "grad_norm": 0.2120071095809977, + "learning_rate": 5.692253671678097e-06, + "loss": 0.2118, + "num_tokens": 3177094054.0, + "step": 4163 + }, + { + "epoch": 5.681424898381573, + "grad_norm": 0.26009342116537454, + "learning_rate": 5.688240072013927e-06, + "loss": 0.2076, + "num_tokens": 3177757331.0, + "step": 4164 + }, + { + "epoch": 5.6827904793402535, + "grad_norm": 0.21127825977018833, + "learning_rate": 5.6842280927376755e-06, + "loss": 0.2009, + "num_tokens": 3178489502.0, + "step": 4165 + }, + { + "epoch": 5.6841560602989345, + "grad_norm": 0.2251269368365144, + "learning_rate": 5.680217735073228e-06, + "loss": 0.2045, + "num_tokens": 3179243334.0, + "step": 4166 + }, + { + "epoch": 5.685521641257615, + "grad_norm": 0.20957623872930387, + "learning_rate": 5.6762090002439705e-06, + "loss": 0.2051, + "num_tokens": 3180018285.0, + "step": 4167 + }, + { + "epoch": 5.686887222216296, + "grad_norm": 0.21696187391291002, + "learning_rate": 5.672201889472795e-06, + "loss": 0.2124, + "num_tokens": 3180757264.0, + "step": 4168 + }, + { + "epoch": 5.688252803174976, + "grad_norm": 0.21203275741145236, + "learning_rate": 5.668196403982092e-06, + "loss": 0.2115, + "num_tokens": 3181566912.0, + "step": 4169 + }, + { + "epoch": 5.689618384133656, + "grad_norm": 0.2089208089227541, + "learning_rate": 5.664192544993766e-06, + "loss": 0.2023, + "num_tokens": 3182335422.0, + "step": 4170 + }, + { + "epoch": 5.690983965092337, + "grad_norm": 0.22108277116048275, + "learning_rate": 5.660190313729224e-06, + "loss": 0.2016, + "num_tokens": 3183039877.0, + "step": 4171 + }, + { + "epoch": 5.692349546051017, + "grad_norm": 0.21849179643887834, + "learning_rate": 5.6561897114093666e-06, + "loss": 0.2073, + "num_tokens": 3183803075.0, + "step": 4172 + }, + { + "epoch": 5.693715127009698, + "grad_norm": 0.21827303922312224, + "learning_rate": 5.652190739254608e-06, + "loss": 0.2099, + "num_tokens": 3184577701.0, + "step": 4173 + }, + { + "epoch": 5.695080707968378, + "grad_norm": 0.21364991220070445, + "learning_rate": 5.648193398484859e-06, + "loss": 0.2025, + "num_tokens": 3185303235.0, + "step": 4174 + }, + { + "epoch": 5.696446288927059, + "grad_norm": 0.2169093851388293, + "learning_rate": 5.64419769031954e-06, + "loss": 0.2054, + "num_tokens": 3186082597.0, + "step": 4175 + }, + { + "epoch": 5.697811869885739, + "grad_norm": 0.21867667813899136, + "learning_rate": 5.640203615977562e-06, + "loss": 0.2086, + "num_tokens": 3186984541.0, + "step": 4176 + }, + { + "epoch": 5.69917745084442, + "grad_norm": 0.22637487156415106, + "learning_rate": 5.636211176677353e-06, + "loss": 0.2099, + "num_tokens": 3187802906.0, + "step": 4177 + }, + { + "epoch": 5.7005430318031, + "grad_norm": 0.20940862193057339, + "learning_rate": 5.632220373636825e-06, + "loss": 0.2119, + "num_tokens": 3188534065.0, + "step": 4178 + }, + { + "epoch": 5.70190861276178, + "grad_norm": 0.21991412790333378, + "learning_rate": 5.628231208073402e-06, + "loss": 0.2082, + "num_tokens": 3189333439.0, + "step": 4179 + }, + { + "epoch": 5.703274193720461, + "grad_norm": 0.2198151247205235, + "learning_rate": 5.624243681204007e-06, + "loss": 0.2095, + "num_tokens": 3190031977.0, + "step": 4180 + }, + { + "epoch": 5.7046397746791415, + "grad_norm": 0.209179104590826, + "learning_rate": 5.620257794245056e-06, + "loss": 0.2002, + "num_tokens": 3190777536.0, + "step": 4181 + }, + { + "epoch": 5.7060053556378225, + "grad_norm": 0.22203526909844284, + "learning_rate": 5.616273548412483e-06, + "loss": 0.2083, + "num_tokens": 3191564273.0, + "step": 4182 + }, + { + "epoch": 5.707370936596503, + "grad_norm": 0.22328209761728995, + "learning_rate": 5.612290944921697e-06, + "loss": 0.2105, + "num_tokens": 3192287475.0, + "step": 4183 + }, + { + "epoch": 5.708736517555184, + "grad_norm": 0.2318235439122243, + "learning_rate": 5.60830998498762e-06, + "loss": 0.2041, + "num_tokens": 3193030960.0, + "step": 4184 + }, + { + "epoch": 5.710102098513864, + "grad_norm": 0.20793962082929957, + "learning_rate": 5.604330669824674e-06, + "loss": 0.2007, + "num_tokens": 3193740604.0, + "step": 4185 + }, + { + "epoch": 5.711467679472545, + "grad_norm": 0.2251826900040625, + "learning_rate": 5.600353000646777e-06, + "loss": 0.2091, + "num_tokens": 3194527109.0, + "step": 4186 + }, + { + "epoch": 5.712833260431225, + "grad_norm": 0.3221795357174082, + "learning_rate": 5.596376978667337e-06, + "loss": 0.2012, + "num_tokens": 3195227991.0, + "step": 4187 + }, + { + "epoch": 5.714198841389905, + "grad_norm": 0.23184050821634009, + "learning_rate": 5.59240260509927e-06, + "loss": 0.2041, + "num_tokens": 3195999454.0, + "step": 4188 + }, + { + "epoch": 5.715564422348586, + "grad_norm": 0.2220343822679678, + "learning_rate": 5.588429881154988e-06, + "loss": 0.2051, + "num_tokens": 3196727210.0, + "step": 4189 + }, + { + "epoch": 5.716930003307266, + "grad_norm": 0.2229491003925695, + "learning_rate": 5.584458808046389e-06, + "loss": 0.2136, + "num_tokens": 3197448777.0, + "step": 4190 + }, + { + "epoch": 5.718295584265947, + "grad_norm": 0.22221143464155388, + "learning_rate": 5.580489386984886e-06, + "loss": 0.2108, + "num_tokens": 3198213864.0, + "step": 4191 + }, + { + "epoch": 5.719661165224627, + "grad_norm": 0.22029028439239534, + "learning_rate": 5.576521619181369e-06, + "loss": 0.2176, + "num_tokens": 3198997536.0, + "step": 4192 + }, + { + "epoch": 5.721026746183308, + "grad_norm": 0.2111173960816002, + "learning_rate": 5.5725555058462355e-06, + "loss": 0.2034, + "num_tokens": 3199745049.0, + "step": 4193 + }, + { + "epoch": 5.722392327141988, + "grad_norm": 0.2118995712731698, + "learning_rate": 5.5685910481893755e-06, + "loss": 0.2098, + "num_tokens": 3200486957.0, + "step": 4194 + }, + { + "epoch": 5.723757908100669, + "grad_norm": 0.21479371609848036, + "learning_rate": 5.564628247420172e-06, + "loss": 0.1996, + "num_tokens": 3201174487.0, + "step": 4195 + }, + { + "epoch": 5.725123489059349, + "grad_norm": 0.22105228961256235, + "learning_rate": 5.560667104747502e-06, + "loss": 0.2046, + "num_tokens": 3201903970.0, + "step": 4196 + }, + { + "epoch": 5.7264890700180295, + "grad_norm": 0.20705616933556797, + "learning_rate": 5.556707621379742e-06, + "loss": 0.2083, + "num_tokens": 3202700110.0, + "step": 4197 + }, + { + "epoch": 5.7278546509767105, + "grad_norm": 0.221517188532402, + "learning_rate": 5.552749798524761e-06, + "loss": 0.2071, + "num_tokens": 3203532829.0, + "step": 4198 + }, + { + "epoch": 5.729220231935391, + "grad_norm": 0.2057744763005639, + "learning_rate": 5.548793637389911e-06, + "loss": 0.2098, + "num_tokens": 3204296785.0, + "step": 4199 + }, + { + "epoch": 5.730585812894072, + "grad_norm": 0.22731729165470155, + "learning_rate": 5.544839139182054e-06, + "loss": 0.2076, + "num_tokens": 3205011171.0, + "step": 4200 + }, + { + "epoch": 5.731951393852752, + "grad_norm": 0.22728791776817003, + "learning_rate": 5.540886305107535e-06, + "loss": 0.2005, + "num_tokens": 3205748357.0, + "step": 4201 + }, + { + "epoch": 5.733316974811433, + "grad_norm": 0.21835099834136748, + "learning_rate": 5.536935136372184e-06, + "loss": 0.2045, + "num_tokens": 3206451879.0, + "step": 4202 + }, + { + "epoch": 5.734682555770113, + "grad_norm": 0.21171533568998757, + "learning_rate": 5.5329856341813425e-06, + "loss": 0.1946, + "num_tokens": 3207249454.0, + "step": 4203 + }, + { + "epoch": 5.736048136728794, + "grad_norm": 0.20700682995359845, + "learning_rate": 5.529037799739825e-06, + "loss": 0.203, + "num_tokens": 3208014832.0, + "step": 4204 + }, + { + "epoch": 5.737413717687474, + "grad_norm": 0.21724911177983613, + "learning_rate": 5.525091634251946e-06, + "loss": 0.2174, + "num_tokens": 3208780386.0, + "step": 4205 + }, + { + "epoch": 5.738779298646154, + "grad_norm": 0.23799172623053194, + "learning_rate": 5.521147138921514e-06, + "loss": 0.2127, + "num_tokens": 3209521371.0, + "step": 4206 + }, + { + "epoch": 5.740144879604835, + "grad_norm": 0.21193376313856313, + "learning_rate": 5.517204314951824e-06, + "loss": 0.2092, + "num_tokens": 3210274446.0, + "step": 4207 + }, + { + "epoch": 5.741510460563515, + "grad_norm": 0.20563248358199748, + "learning_rate": 5.51326316354565e-06, + "loss": 0.2003, + "num_tokens": 3211058488.0, + "step": 4208 + }, + { + "epoch": 5.742876041522196, + "grad_norm": 0.21617990396044937, + "learning_rate": 5.509323685905281e-06, + "loss": 0.2061, + "num_tokens": 3211836131.0, + "step": 4209 + }, + { + "epoch": 5.744241622480876, + "grad_norm": 0.2086093815405659, + "learning_rate": 5.505385883232476e-06, + "loss": 0.2043, + "num_tokens": 3212598231.0, + "step": 4210 + }, + { + "epoch": 5.745607203439557, + "grad_norm": 0.23242601937821505, + "learning_rate": 5.501449756728479e-06, + "loss": 0.2087, + "num_tokens": 3213297670.0, + "step": 4211 + }, + { + "epoch": 5.746972784398237, + "grad_norm": 0.2095644538763236, + "learning_rate": 5.497515307594045e-06, + "loss": 0.2067, + "num_tokens": 3214129169.0, + "step": 4212 + }, + { + "epoch": 5.7483383653569184, + "grad_norm": 0.2085374176410146, + "learning_rate": 5.493582537029396e-06, + "loss": 0.218, + "num_tokens": 3214912290.0, + "step": 4213 + }, + { + "epoch": 5.749703946315599, + "grad_norm": 0.22304254700386042, + "learning_rate": 5.4896514462342525e-06, + "loss": 0.209, + "num_tokens": 3215690671.0, + "step": 4214 + }, + { + "epoch": 5.751069527274279, + "grad_norm": 0.21195146778446572, + "learning_rate": 5.485722036407819e-06, + "loss": 0.2114, + "num_tokens": 3216452994.0, + "step": 4215 + }, + { + "epoch": 5.75243510823296, + "grad_norm": 0.216456612300976, + "learning_rate": 5.481794308748792e-06, + "loss": 0.2115, + "num_tokens": 3217209036.0, + "step": 4216 + }, + { + "epoch": 5.75380068919164, + "grad_norm": 0.22097617692338534, + "learning_rate": 5.477868264455346e-06, + "loss": 0.2056, + "num_tokens": 3217964047.0, + "step": 4217 + }, + { + "epoch": 5.755166270150321, + "grad_norm": 0.20455618307805526, + "learning_rate": 5.473943904725149e-06, + "loss": 0.2098, + "num_tokens": 3218753988.0, + "step": 4218 + }, + { + "epoch": 5.756531851109001, + "grad_norm": 0.20380263317187766, + "learning_rate": 5.470021230755358e-06, + "loss": 0.2179, + "num_tokens": 3219525105.0, + "step": 4219 + }, + { + "epoch": 5.757897432067682, + "grad_norm": 0.23241617806064577, + "learning_rate": 5.466100243742598e-06, + "loss": 0.2086, + "num_tokens": 3220272817.0, + "step": 4220 + }, + { + "epoch": 5.759263013026362, + "grad_norm": 0.19719624748270617, + "learning_rate": 5.4621809448830084e-06, + "loss": 0.2131, + "num_tokens": 3221050873.0, + "step": 4221 + }, + { + "epoch": 5.760628593985043, + "grad_norm": 0.21722937780254506, + "learning_rate": 5.458263335372186e-06, + "loss": 0.2085, + "num_tokens": 3221844959.0, + "step": 4222 + }, + { + "epoch": 5.761994174943723, + "grad_norm": 0.21838877853012525, + "learning_rate": 5.454347416405229e-06, + "loss": 0.2023, + "num_tokens": 3222581258.0, + "step": 4223 + }, + { + "epoch": 5.763359755902403, + "grad_norm": 0.21131473470635287, + "learning_rate": 5.450433189176712e-06, + "loss": 0.2058, + "num_tokens": 3223325447.0, + "step": 4224 + }, + { + "epoch": 5.764725336861084, + "grad_norm": 0.21723025573530222, + "learning_rate": 5.446520654880702e-06, + "loss": 0.2102, + "num_tokens": 3224034726.0, + "step": 4225 + }, + { + "epoch": 5.766090917819764, + "grad_norm": 0.21843710634020033, + "learning_rate": 5.442609814710736e-06, + "loss": 0.2076, + "num_tokens": 3224788015.0, + "step": 4226 + }, + { + "epoch": 5.767456498778445, + "grad_norm": 0.2047665164258604, + "learning_rate": 5.438700669859844e-06, + "loss": 0.2065, + "num_tokens": 3225622323.0, + "step": 4227 + }, + { + "epoch": 5.7688220797371255, + "grad_norm": 0.20442896012783388, + "learning_rate": 5.434793221520542e-06, + "loss": 0.2063, + "num_tokens": 3226453591.0, + "step": 4228 + }, + { + "epoch": 5.7701876606958065, + "grad_norm": 0.21252747574411618, + "learning_rate": 5.4308874708848145e-06, + "loss": 0.2081, + "num_tokens": 3227226684.0, + "step": 4229 + }, + { + "epoch": 5.771553241654487, + "grad_norm": 0.23583610102433405, + "learning_rate": 5.426983419144148e-06, + "loss": 0.2077, + "num_tokens": 3228114046.0, + "step": 4230 + }, + { + "epoch": 5.772918822613168, + "grad_norm": 0.19979481668525254, + "learning_rate": 5.42308106748949e-06, + "loss": 0.194, + "num_tokens": 3228849217.0, + "step": 4231 + }, + { + "epoch": 5.774284403571848, + "grad_norm": 0.21076584079899705, + "learning_rate": 5.419180417111282e-06, + "loss": 0.2, + "num_tokens": 3229527686.0, + "step": 4232 + }, + { + "epoch": 5.775649984530528, + "grad_norm": 0.21924347734212937, + "learning_rate": 5.415281469199447e-06, + "loss": 0.203, + "num_tokens": 3230322181.0, + "step": 4233 + }, + { + "epoch": 5.777015565489209, + "grad_norm": 0.2135158821265052, + "learning_rate": 5.41138422494338e-06, + "loss": 0.208, + "num_tokens": 3231087725.0, + "step": 4234 + }, + { + "epoch": 5.778381146447889, + "grad_norm": 0.226938044962691, + "learning_rate": 5.407488685531963e-06, + "loss": 0.2071, + "num_tokens": 3231798444.0, + "step": 4235 + }, + { + "epoch": 5.77974672740657, + "grad_norm": 0.19839179528987155, + "learning_rate": 5.4035948521535575e-06, + "loss": 0.2202, + "num_tokens": 3232640034.0, + "step": 4236 + }, + { + "epoch": 5.78111230836525, + "grad_norm": 0.21473698981998282, + "learning_rate": 5.399702725996001e-06, + "loss": 0.2105, + "num_tokens": 3233467000.0, + "step": 4237 + }, + { + "epoch": 5.782477889323931, + "grad_norm": 0.22757290814232478, + "learning_rate": 5.395812308246617e-06, + "loss": 0.2148, + "num_tokens": 3234204552.0, + "step": 4238 + }, + { + "epoch": 5.783843470282611, + "grad_norm": 0.2119407282937037, + "learning_rate": 5.391923600092204e-06, + "loss": 0.2001, + "num_tokens": 3234914773.0, + "step": 4239 + }, + { + "epoch": 5.785209051241292, + "grad_norm": 0.21023216573991196, + "learning_rate": 5.388036602719031e-06, + "loss": 0.2018, + "num_tokens": 3235690965.0, + "step": 4240 + }, + { + "epoch": 5.786574632199972, + "grad_norm": 0.21780525051768712, + "learning_rate": 5.384151317312858e-06, + "loss": 0.2072, + "num_tokens": 3236457810.0, + "step": 4241 + }, + { + "epoch": 5.787940213158652, + "grad_norm": 0.20861535845922327, + "learning_rate": 5.380267745058918e-06, + "loss": 0.2034, + "num_tokens": 3237110740.0, + "step": 4242 + }, + { + "epoch": 5.789305794117333, + "grad_norm": 0.21666499170770342, + "learning_rate": 5.376385887141917e-06, + "loss": 0.2117, + "num_tokens": 3237857545.0, + "step": 4243 + }, + { + "epoch": 5.7906713750760135, + "grad_norm": 0.23865192407785457, + "learning_rate": 5.372505744746044e-06, + "loss": 0.1983, + "num_tokens": 3238588908.0, + "step": 4244 + }, + { + "epoch": 5.7920369560346945, + "grad_norm": 0.20190108714219912, + "learning_rate": 5.368627319054962e-06, + "loss": 0.2055, + "num_tokens": 3239333447.0, + "step": 4245 + }, + { + "epoch": 5.793402536993375, + "grad_norm": 0.20648191953908449, + "learning_rate": 5.36475061125181e-06, + "loss": 0.1995, + "num_tokens": 3240089534.0, + "step": 4246 + }, + { + "epoch": 5.794768117952056, + "grad_norm": 0.21183723035387533, + "learning_rate": 5.360875622519206e-06, + "loss": 0.2098, + "num_tokens": 3240867072.0, + "step": 4247 + }, + { + "epoch": 5.796133698910736, + "grad_norm": 0.21704285914067947, + "learning_rate": 5.357002354039243e-06, + "loss": 0.204, + "num_tokens": 3241549888.0, + "step": 4248 + }, + { + "epoch": 5.797499279869417, + "grad_norm": 0.21899926988477267, + "learning_rate": 5.3531308069934805e-06, + "loss": 0.2085, + "num_tokens": 3242338907.0, + "step": 4249 + }, + { + "epoch": 5.798864860828097, + "grad_norm": 0.21654837110254418, + "learning_rate": 5.349260982562963e-06, + "loss": 0.1983, + "num_tokens": 3243071888.0, + "step": 4250 + }, + { + "epoch": 5.800230441786777, + "grad_norm": 0.22214505598588327, + "learning_rate": 5.3453928819282105e-06, + "loss": 0.1998, + "num_tokens": 3243797760.0, + "step": 4251 + }, + { + "epoch": 5.801596022745458, + "grad_norm": 0.21917820400216237, + "learning_rate": 5.341526506269208e-06, + "loss": 0.1978, + "num_tokens": 3244515569.0, + "step": 4252 + }, + { + "epoch": 5.802961603704138, + "grad_norm": 0.24734326250469496, + "learning_rate": 5.337661856765419e-06, + "loss": 0.2022, + "num_tokens": 3245213256.0, + "step": 4253 + }, + { + "epoch": 5.804327184662819, + "grad_norm": 0.22286648804488418, + "learning_rate": 5.333798934595784e-06, + "loss": 0.2154, + "num_tokens": 3245930182.0, + "step": 4254 + }, + { + "epoch": 5.805692765621499, + "grad_norm": 0.20967386384571665, + "learning_rate": 5.329937740938711e-06, + "loss": 0.2053, + "num_tokens": 3246765786.0, + "step": 4255 + }, + { + "epoch": 5.80705834658018, + "grad_norm": 0.2096688573858637, + "learning_rate": 5.3260782769720865e-06, + "loss": 0.2014, + "num_tokens": 3247518844.0, + "step": 4256 + }, + { + "epoch": 5.80842392753886, + "grad_norm": 0.22356079913806276, + "learning_rate": 5.32222054387326e-06, + "loss": 0.2116, + "num_tokens": 3248212248.0, + "step": 4257 + }, + { + "epoch": 5.809789508497541, + "grad_norm": 0.23357629409070074, + "learning_rate": 5.318364542819062e-06, + "loss": 0.2052, + "num_tokens": 3248924615.0, + "step": 4258 + }, + { + "epoch": 5.811155089456221, + "grad_norm": 0.22396863417081314, + "learning_rate": 5.314510274985792e-06, + "loss": 0.2037, + "num_tokens": 3249707347.0, + "step": 4259 + }, + { + "epoch": 5.8125206704149015, + "grad_norm": 0.2163136701415485, + "learning_rate": 5.310657741549221e-06, + "loss": 0.2041, + "num_tokens": 3250419921.0, + "step": 4260 + }, + { + "epoch": 5.8138862513735825, + "grad_norm": 0.22716725346773228, + "learning_rate": 5.306806943684586e-06, + "loss": 0.2089, + "num_tokens": 3251170917.0, + "step": 4261 + }, + { + "epoch": 5.815251832332263, + "grad_norm": 0.2141548579426943, + "learning_rate": 5.302957882566602e-06, + "loss": 0.2122, + "num_tokens": 3252012057.0, + "step": 4262 + }, + { + "epoch": 5.816617413290944, + "grad_norm": 0.2418110747203828, + "learning_rate": 5.299110559369449e-06, + "loss": 0.2138, + "num_tokens": 3252781977.0, + "step": 4263 + }, + { + "epoch": 5.817982994249624, + "grad_norm": 0.19964885661196688, + "learning_rate": 5.29526497526678e-06, + "loss": 0.2191, + "num_tokens": 3253555334.0, + "step": 4264 + }, + { + "epoch": 5.819348575208305, + "grad_norm": 0.21499519344453938, + "learning_rate": 5.291421131431721e-06, + "loss": 0.205, + "num_tokens": 3254276471.0, + "step": 4265 + }, + { + "epoch": 5.820714156166985, + "grad_norm": 0.2190296438373196, + "learning_rate": 5.287579029036854e-06, + "loss": 0.2084, + "num_tokens": 3255028291.0, + "step": 4266 + }, + { + "epoch": 5.822079737125666, + "grad_norm": 0.20955966638409798, + "learning_rate": 5.283738669254239e-06, + "loss": 0.2083, + "num_tokens": 3255776714.0, + "step": 4267 + }, + { + "epoch": 5.823445318084346, + "grad_norm": 0.20802750826421257, + "learning_rate": 5.279900053255407e-06, + "loss": 0.2144, + "num_tokens": 3256567695.0, + "step": 4268 + }, + { + "epoch": 5.824810899043026, + "grad_norm": 0.22619374530979092, + "learning_rate": 5.276063182211355e-06, + "loss": 0.2097, + "num_tokens": 3257350178.0, + "step": 4269 + }, + { + "epoch": 5.826176480001707, + "grad_norm": 0.2133765418911571, + "learning_rate": 5.272228057292537e-06, + "loss": 0.211, + "num_tokens": 3258108844.0, + "step": 4270 + }, + { + "epoch": 5.827542060960387, + "grad_norm": 0.21388621500752694, + "learning_rate": 5.268394679668898e-06, + "loss": 0.2034, + "num_tokens": 3258862161.0, + "step": 4271 + }, + { + "epoch": 5.828907641919068, + "grad_norm": 0.21503194365858638, + "learning_rate": 5.264563050509826e-06, + "loss": 0.2132, + "num_tokens": 3259649546.0, + "step": 4272 + }, + { + "epoch": 5.830273222877748, + "grad_norm": 0.22044295599093336, + "learning_rate": 5.26073317098418e-06, + "loss": 0.1994, + "num_tokens": 3260378620.0, + "step": 4273 + }, + { + "epoch": 5.831638803836429, + "grad_norm": 0.21109563817559718, + "learning_rate": 5.256905042260303e-06, + "loss": 0.2082, + "num_tokens": 3261203666.0, + "step": 4274 + }, + { + "epoch": 5.833004384795109, + "grad_norm": 0.21574876191322387, + "learning_rate": 5.2530786655059825e-06, + "loss": 0.211, + "num_tokens": 3261949136.0, + "step": 4275 + }, + { + "epoch": 5.83436996575379, + "grad_norm": 0.21317963019101357, + "learning_rate": 5.249254041888485e-06, + "loss": 0.2049, + "num_tokens": 3262700366.0, + "step": 4276 + }, + { + "epoch": 5.8357355467124705, + "grad_norm": 0.22040899052544635, + "learning_rate": 5.245431172574533e-06, + "loss": 0.2006, + "num_tokens": 3263438965.0, + "step": 4277 + }, + { + "epoch": 5.837101127671151, + "grad_norm": 0.22883335640467845, + "learning_rate": 5.241610058730327e-06, + "loss": 0.2064, + "num_tokens": 3264120668.0, + "step": 4278 + }, + { + "epoch": 5.838466708629832, + "grad_norm": 0.2136898359493668, + "learning_rate": 5.237790701521512e-06, + "loss": 0.209, + "num_tokens": 3264886474.0, + "step": 4279 + }, + { + "epoch": 5.839832289588512, + "grad_norm": 0.21613962742259807, + "learning_rate": 5.233973102113215e-06, + "loss": 0.2126, + "num_tokens": 3265675330.0, + "step": 4280 + }, + { + "epoch": 5.841197870547193, + "grad_norm": 0.20326860189631368, + "learning_rate": 5.230157261670023e-06, + "loss": 0.2025, + "num_tokens": 3266519191.0, + "step": 4281 + }, + { + "epoch": 5.842563451505873, + "grad_norm": 0.2126056708207295, + "learning_rate": 5.226343181355972e-06, + "loss": 0.2143, + "num_tokens": 3267309371.0, + "step": 4282 + }, + { + "epoch": 5.843929032464554, + "grad_norm": 0.21967408916261885, + "learning_rate": 5.222530862334587e-06, + "loss": 0.2097, + "num_tokens": 3268077241.0, + "step": 4283 + }, + { + "epoch": 5.845294613423234, + "grad_norm": 0.21069315680177833, + "learning_rate": 5.2187203057688314e-06, + "loss": 0.1934, + "num_tokens": 3268749832.0, + "step": 4284 + }, + { + "epoch": 5.846660194381915, + "grad_norm": 0.2211461826018855, + "learning_rate": 5.214911512821146e-06, + "loss": 0.2049, + "num_tokens": 3269447963.0, + "step": 4285 + }, + { + "epoch": 5.848025775340595, + "grad_norm": 0.21130044451007018, + "learning_rate": 5.211104484653427e-06, + "loss": 0.2144, + "num_tokens": 3270187708.0, + "step": 4286 + }, + { + "epoch": 5.849391356299275, + "grad_norm": 0.2341511752425816, + "learning_rate": 5.207299222427037e-06, + "loss": 0.2031, + "num_tokens": 3270874520.0, + "step": 4287 + }, + { + "epoch": 5.850756937257956, + "grad_norm": 0.208967743256596, + "learning_rate": 5.203495727302789e-06, + "loss": 0.1945, + "num_tokens": 3271553203.0, + "step": 4288 + }, + { + "epoch": 5.852122518216636, + "grad_norm": 0.21664586199560606, + "learning_rate": 5.199694000440971e-06, + "loss": 0.2037, + "num_tokens": 3272304379.0, + "step": 4289 + }, + { + "epoch": 5.853488099175317, + "grad_norm": 0.21460257494661653, + "learning_rate": 5.1958940430013275e-06, + "loss": 0.2011, + "num_tokens": 3273074280.0, + "step": 4290 + }, + { + "epoch": 5.854853680133997, + "grad_norm": 0.22888846940778182, + "learning_rate": 5.192095856143049e-06, + "loss": 0.2009, + "num_tokens": 3273777879.0, + "step": 4291 + }, + { + "epoch": 5.856219261092678, + "grad_norm": 0.20150240069647052, + "learning_rate": 5.188299441024812e-06, + "loss": 0.2116, + "num_tokens": 3274547114.0, + "step": 4292 + }, + { + "epoch": 5.8575848420513585, + "grad_norm": 0.23367448929096357, + "learning_rate": 5.18450479880473e-06, + "loss": 0.2078, + "num_tokens": 3275295259.0, + "step": 4293 + }, + { + "epoch": 5.8589504230100395, + "grad_norm": 0.20692345283129254, + "learning_rate": 5.180711930640387e-06, + "loss": 0.2008, + "num_tokens": 3276069554.0, + "step": 4294 + }, + { + "epoch": 5.86031600396872, + "grad_norm": 0.21162434163948513, + "learning_rate": 5.176920837688824e-06, + "loss": 0.2033, + "num_tokens": 3276838246.0, + "step": 4295 + }, + { + "epoch": 5.8616815849274, + "grad_norm": 0.21938280387218573, + "learning_rate": 5.173131521106534e-06, + "loss": 0.2073, + "num_tokens": 3277561695.0, + "step": 4296 + }, + { + "epoch": 5.863047165886081, + "grad_norm": 0.20825052217912057, + "learning_rate": 5.169343982049476e-06, + "loss": 0.2181, + "num_tokens": 3278396040.0, + "step": 4297 + }, + { + "epoch": 5.864412746844761, + "grad_norm": 0.22920460866176232, + "learning_rate": 5.165558221673066e-06, + "loss": 0.2194, + "num_tokens": 3279124842.0, + "step": 4298 + }, + { + "epoch": 5.865778327803442, + "grad_norm": 0.21908621964082123, + "learning_rate": 5.161774241132178e-06, + "loss": 0.2046, + "num_tokens": 3279822553.0, + "step": 4299 + }, + { + "epoch": 5.867143908762122, + "grad_norm": 0.23915925181085307, + "learning_rate": 5.1579920415811305e-06, + "loss": 0.2064, + "num_tokens": 3280533997.0, + "step": 4300 + }, + { + "epoch": 5.868509489720803, + "grad_norm": 0.2218596843839502, + "learning_rate": 5.154211624173721e-06, + "loss": 0.2059, + "num_tokens": 3281247310.0, + "step": 4301 + }, + { + "epoch": 5.869875070679483, + "grad_norm": 0.20714650474792562, + "learning_rate": 5.150432990063181e-06, + "loss": 0.2005, + "num_tokens": 3282052033.0, + "step": 4302 + }, + { + "epoch": 5.871240651638164, + "grad_norm": 0.21281327476629777, + "learning_rate": 5.146656140402214e-06, + "loss": 0.2063, + "num_tokens": 3282753732.0, + "step": 4303 + }, + { + "epoch": 5.872606232596844, + "grad_norm": 0.2192777345980949, + "learning_rate": 5.142881076342976e-06, + "loss": 0.1986, + "num_tokens": 3283501176.0, + "step": 4304 + }, + { + "epoch": 5.873971813555524, + "grad_norm": 0.20388940086389573, + "learning_rate": 5.139107799037067e-06, + "loss": 0.2068, + "num_tokens": 3284238988.0, + "step": 4305 + }, + { + "epoch": 5.875337394514205, + "grad_norm": 0.20915427658768287, + "learning_rate": 5.135336309635554e-06, + "loss": 0.2017, + "num_tokens": 3285067399.0, + "step": 4306 + }, + { + "epoch": 5.8767029754728854, + "grad_norm": 0.19964095568486448, + "learning_rate": 5.131566609288956e-06, + "loss": 0.202, + "num_tokens": 3285873888.0, + "step": 4307 + }, + { + "epoch": 5.8780685564315664, + "grad_norm": 0.20402011796763142, + "learning_rate": 5.12779869914725e-06, + "loss": 0.205, + "num_tokens": 3286664345.0, + "step": 4308 + }, + { + "epoch": 5.879434137390247, + "grad_norm": 0.20699272489509823, + "learning_rate": 5.12403258035985e-06, + "loss": 0.208, + "num_tokens": 3287406288.0, + "step": 4309 + }, + { + "epoch": 5.880799718348928, + "grad_norm": 0.23332732758633432, + "learning_rate": 5.120268254075651e-06, + "loss": 0.2104, + "num_tokens": 3288196384.0, + "step": 4310 + }, + { + "epoch": 5.882165299307608, + "grad_norm": 0.211704477149572, + "learning_rate": 5.116505721442973e-06, + "loss": 0.2059, + "num_tokens": 3288946042.0, + "step": 4311 + }, + { + "epoch": 5.883530880266289, + "grad_norm": 0.20688819191061014, + "learning_rate": 5.1127449836096075e-06, + "loss": 0.2097, + "num_tokens": 3289677088.0, + "step": 4312 + }, + { + "epoch": 5.884896461224969, + "grad_norm": 0.21389439942296823, + "learning_rate": 5.1089860417227954e-06, + "loss": 0.1984, + "num_tokens": 3290449035.0, + "step": 4313 + }, + { + "epoch": 5.886262042183649, + "grad_norm": 0.2067227950024797, + "learning_rate": 5.105228896929222e-06, + "loss": 0.2113, + "num_tokens": 3291197208.0, + "step": 4314 + }, + { + "epoch": 5.88762762314233, + "grad_norm": 0.2082058742033818, + "learning_rate": 5.101473550375029e-06, + "loss": 0.2051, + "num_tokens": 3291904843.0, + "step": 4315 + }, + { + "epoch": 5.88899320410101, + "grad_norm": 0.21896873578723888, + "learning_rate": 5.097720003205813e-06, + "loss": 0.211, + "num_tokens": 3292651610.0, + "step": 4316 + }, + { + "epoch": 5.890358785059691, + "grad_norm": 0.2095118476267361, + "learning_rate": 5.093968256566623e-06, + "loss": 0.2041, + "num_tokens": 3293390492.0, + "step": 4317 + }, + { + "epoch": 5.891724366018371, + "grad_norm": 0.21219252260559157, + "learning_rate": 5.090218311601944e-06, + "loss": 0.2063, + "num_tokens": 3294147522.0, + "step": 4318 + }, + { + "epoch": 5.893089946977052, + "grad_norm": 0.2165786605823634, + "learning_rate": 5.086470169455728e-06, + "loss": 0.2005, + "num_tokens": 3294816452.0, + "step": 4319 + }, + { + "epoch": 5.894455527935732, + "grad_norm": 0.21132510784164318, + "learning_rate": 5.08272383127137e-06, + "loss": 0.2132, + "num_tokens": 3295650715.0, + "step": 4320 + }, + { + "epoch": 5.895821108894413, + "grad_norm": 0.23472334919192886, + "learning_rate": 5.078979298191717e-06, + "loss": 0.212, + "num_tokens": 3296426697.0, + "step": 4321 + }, + { + "epoch": 5.897186689853093, + "grad_norm": 0.21325005362807478, + "learning_rate": 5.075236571359063e-06, + "loss": 0.2083, + "num_tokens": 3297169758.0, + "step": 4322 + }, + { + "epoch": 5.8985522708117735, + "grad_norm": 0.21817819755737483, + "learning_rate": 5.0714956519151506e-06, + "loss": 0.204, + "num_tokens": 3297918546.0, + "step": 4323 + }, + { + "epoch": 5.8999178517704545, + "grad_norm": 0.20837556125867127, + "learning_rate": 5.067756541001173e-06, + "loss": 0.2012, + "num_tokens": 3298741246.0, + "step": 4324 + }, + { + "epoch": 5.901283432729135, + "grad_norm": 0.21742721344862487, + "learning_rate": 5.06401923975777e-06, + "loss": 0.1952, + "num_tokens": 3299477437.0, + "step": 4325 + }, + { + "epoch": 5.902649013687816, + "grad_norm": 0.21705678009930437, + "learning_rate": 5.060283749325033e-06, + "loss": 0.2103, + "num_tokens": 3300209536.0, + "step": 4326 + }, + { + "epoch": 5.904014594646496, + "grad_norm": 0.2167365092664306, + "learning_rate": 5.0565500708424974e-06, + "loss": 0.1974, + "num_tokens": 3300988187.0, + "step": 4327 + }, + { + "epoch": 5.905380175605177, + "grad_norm": 0.2145120820499253, + "learning_rate": 5.052818205449146e-06, + "loss": 0.2066, + "num_tokens": 3301695135.0, + "step": 4328 + }, + { + "epoch": 5.906745756563857, + "grad_norm": 0.21966608179895106, + "learning_rate": 5.049088154283408e-06, + "loss": 0.2043, + "num_tokens": 3302452563.0, + "step": 4329 + }, + { + "epoch": 5.908111337522538, + "grad_norm": 0.23560542723043307, + "learning_rate": 5.045359918483162e-06, + "loss": 0.213, + "num_tokens": 3303217423.0, + "step": 4330 + }, + { + "epoch": 5.909476918481218, + "grad_norm": 0.21777119466828512, + "learning_rate": 5.041633499185734e-06, + "loss": 0.2109, + "num_tokens": 3303971620.0, + "step": 4331 + }, + { + "epoch": 5.910842499439898, + "grad_norm": 0.21447875595395602, + "learning_rate": 5.037908897527888e-06, + "loss": 0.2046, + "num_tokens": 3304761851.0, + "step": 4332 + }, + { + "epoch": 5.912208080398579, + "grad_norm": 0.216226126931504, + "learning_rate": 5.034186114645842e-06, + "loss": 0.2074, + "num_tokens": 3305456057.0, + "step": 4333 + }, + { + "epoch": 5.913573661357259, + "grad_norm": 0.2119073996197028, + "learning_rate": 5.030465151675257e-06, + "loss": 0.2034, + "num_tokens": 3306251305.0, + "step": 4334 + }, + { + "epoch": 5.91493924231594, + "grad_norm": 0.2050809888935085, + "learning_rate": 5.0267460097512285e-06, + "loss": 0.204, + "num_tokens": 3306986725.0, + "step": 4335 + }, + { + "epoch": 5.91630482327462, + "grad_norm": 0.21379655028086933, + "learning_rate": 5.02302869000832e-06, + "loss": 0.2055, + "num_tokens": 3307713896.0, + "step": 4336 + }, + { + "epoch": 5.917670404233301, + "grad_norm": 0.2736571320246307, + "learning_rate": 5.019313193580515e-06, + "loss": 0.2102, + "num_tokens": 3308501007.0, + "step": 4337 + }, + { + "epoch": 5.919035985191981, + "grad_norm": 0.2068032644804702, + "learning_rate": 5.015599521601253e-06, + "loss": 0.2091, + "num_tokens": 3309187467.0, + "step": 4338 + }, + { + "epoch": 5.920401566150662, + "grad_norm": 0.2454308551382579, + "learning_rate": 5.0118876752034155e-06, + "loss": 0.2183, + "num_tokens": 3309956356.0, + "step": 4339 + }, + { + "epoch": 5.9217671471093425, + "grad_norm": 0.21683351929514097, + "learning_rate": 5.008177655519327e-06, + "loss": 0.209, + "num_tokens": 3310763853.0, + "step": 4340 + }, + { + "epoch": 5.923132728068023, + "grad_norm": 0.208237596357415, + "learning_rate": 5.004469463680748e-06, + "loss": 0.2068, + "num_tokens": 3311519765.0, + "step": 4341 + }, + { + "epoch": 5.924498309026704, + "grad_norm": 0.21057392268394162, + "learning_rate": 5.000763100818893e-06, + "loss": 0.2071, + "num_tokens": 3312340164.0, + "step": 4342 + }, + { + "epoch": 5.925863889985384, + "grad_norm": 0.22311089282393046, + "learning_rate": 4.997058568064413e-06, + "loss": 0.2102, + "num_tokens": 3313110884.0, + "step": 4343 + }, + { + "epoch": 5.927229470944065, + "grad_norm": 0.22005315544875517, + "learning_rate": 4.993355866547393e-06, + "loss": 0.2114, + "num_tokens": 3313869665.0, + "step": 4344 + }, + { + "epoch": 5.928595051902745, + "grad_norm": 0.21405059745728336, + "learning_rate": 4.989654997397379e-06, + "loss": 0.2065, + "num_tokens": 3314639717.0, + "step": 4345 + }, + { + "epoch": 5.929960632861426, + "grad_norm": 0.2096237706265529, + "learning_rate": 4.985955961743336e-06, + "loss": 0.198, + "num_tokens": 3315383057.0, + "step": 4346 + }, + { + "epoch": 5.931326213820106, + "grad_norm": 0.2343633381826909, + "learning_rate": 4.982258760713683e-06, + "loss": 0.2107, + "num_tokens": 3316161996.0, + "step": 4347 + }, + { + "epoch": 5.932691794778787, + "grad_norm": 0.22298258886413252, + "learning_rate": 4.978563395436276e-06, + "loss": 0.2082, + "num_tokens": 3316908831.0, + "step": 4348 + }, + { + "epoch": 5.934057375737467, + "grad_norm": 0.21266652114805817, + "learning_rate": 4.9748698670384134e-06, + "loss": 0.2045, + "num_tokens": 3317664998.0, + "step": 4349 + }, + { + "epoch": 5.935422956696147, + "grad_norm": 0.21011183094230917, + "learning_rate": 4.971178176646825e-06, + "loss": 0.2077, + "num_tokens": 3318394301.0, + "step": 4350 + }, + { + "epoch": 5.936788537654828, + "grad_norm": 0.20515052217632412, + "learning_rate": 4.96748832538769e-06, + "loss": 0.2069, + "num_tokens": 3319170978.0, + "step": 4351 + }, + { + "epoch": 5.938154118613508, + "grad_norm": 0.19689597779735668, + "learning_rate": 4.963800314386623e-06, + "loss": 0.2095, + "num_tokens": 3319939756.0, + "step": 4352 + }, + { + "epoch": 5.939519699572189, + "grad_norm": 0.20421433926975466, + "learning_rate": 4.960114144768669e-06, + "loss": 0.2104, + "num_tokens": 3320724501.0, + "step": 4353 + }, + { + "epoch": 5.940885280530869, + "grad_norm": 0.2158002980247675, + "learning_rate": 4.95642981765833e-06, + "loss": 0.2042, + "num_tokens": 3321457597.0, + "step": 4354 + }, + { + "epoch": 5.94225086148955, + "grad_norm": 0.22515287782829732, + "learning_rate": 4.952747334179524e-06, + "loss": 0.2094, + "num_tokens": 3322235945.0, + "step": 4355 + }, + { + "epoch": 5.9436164424482305, + "grad_norm": 0.20152186948059192, + "learning_rate": 4.949066695455625e-06, + "loss": 0.2068, + "num_tokens": 3323024156.0, + "step": 4356 + }, + { + "epoch": 5.9449820234069115, + "grad_norm": 0.2265016756462833, + "learning_rate": 4.945387902609432e-06, + "loss": 0.2177, + "num_tokens": 3323830175.0, + "step": 4357 + }, + { + "epoch": 5.946347604365592, + "grad_norm": 0.24756695049688368, + "learning_rate": 4.941710956763192e-06, + "loss": 0.2102, + "num_tokens": 3324575269.0, + "step": 4358 + }, + { + "epoch": 5.947713185324272, + "grad_norm": 0.2172319835909987, + "learning_rate": 4.938035859038571e-06, + "loss": 0.2198, + "num_tokens": 3325306604.0, + "step": 4359 + }, + { + "epoch": 5.949078766282953, + "grad_norm": 0.21984033945111656, + "learning_rate": 4.93436261055669e-06, + "loss": 0.2069, + "num_tokens": 3326101615.0, + "step": 4360 + }, + { + "epoch": 5.950444347241633, + "grad_norm": 0.21582582364257336, + "learning_rate": 4.930691212438098e-06, + "loss": 0.2057, + "num_tokens": 3326854259.0, + "step": 4361 + }, + { + "epoch": 5.951809928200314, + "grad_norm": 0.20869154878723692, + "learning_rate": 4.927021665802772e-06, + "loss": 0.21, + "num_tokens": 3327667602.0, + "step": 4362 + }, + { + "epoch": 5.953175509158994, + "grad_norm": 0.20894481123627578, + "learning_rate": 4.923353971770144e-06, + "loss": 0.203, + "num_tokens": 3328415257.0, + "step": 4363 + }, + { + "epoch": 5.954541090117675, + "grad_norm": 0.20850455373242263, + "learning_rate": 4.919688131459059e-06, + "loss": 0.199, + "num_tokens": 3329117520.0, + "step": 4364 + }, + { + "epoch": 5.955906671076355, + "grad_norm": 0.2136825546116263, + "learning_rate": 4.916024145987808e-06, + "loss": 0.2051, + "num_tokens": 3329912439.0, + "step": 4365 + }, + { + "epoch": 5.957272252035036, + "grad_norm": 0.21587236345558525, + "learning_rate": 4.91236201647412e-06, + "loss": 0.2104, + "num_tokens": 3330658866.0, + "step": 4366 + }, + { + "epoch": 5.958637832993716, + "grad_norm": 0.21723331080978672, + "learning_rate": 4.908701744035144e-06, + "loss": 0.2074, + "num_tokens": 3331435127.0, + "step": 4367 + }, + { + "epoch": 5.960003413952396, + "grad_norm": 0.21404275802035458, + "learning_rate": 4.905043329787474e-06, + "loss": 0.2071, + "num_tokens": 3332223902.0, + "step": 4368 + }, + { + "epoch": 5.961368994911077, + "grad_norm": 0.21991077929034253, + "learning_rate": 4.901386774847134e-06, + "loss": 0.2058, + "num_tokens": 3332997906.0, + "step": 4369 + }, + { + "epoch": 5.962734575869757, + "grad_norm": 0.20658458169205982, + "learning_rate": 4.897732080329583e-06, + "loss": 0.2113, + "num_tokens": 3333831393.0, + "step": 4370 + }, + { + "epoch": 5.964100156828438, + "grad_norm": 0.20871064801211173, + "learning_rate": 4.894079247349703e-06, + "loss": 0.2087, + "num_tokens": 3334587339.0, + "step": 4371 + }, + { + "epoch": 5.9654657377871185, + "grad_norm": 0.2033008717439575, + "learning_rate": 4.8904282770218244e-06, + "loss": 0.2044, + "num_tokens": 3335336624.0, + "step": 4372 + }, + { + "epoch": 5.9668313187457995, + "grad_norm": 0.21107413142514553, + "learning_rate": 4.886779170459695e-06, + "loss": 0.2071, + "num_tokens": 3336144643.0, + "step": 4373 + }, + { + "epoch": 5.96819689970448, + "grad_norm": 0.20511771734750694, + "learning_rate": 4.883131928776495e-06, + "loss": 0.2113, + "num_tokens": 3336921502.0, + "step": 4374 + }, + { + "epoch": 5.969562480663161, + "grad_norm": 0.21942976988756668, + "learning_rate": 4.879486553084849e-06, + "loss": 0.2053, + "num_tokens": 3337672822.0, + "step": 4375 + }, + { + "epoch": 5.970928061621841, + "grad_norm": 0.20087426256510466, + "learning_rate": 4.875843044496797e-06, + "loss": 0.2152, + "num_tokens": 3338517051.0, + "step": 4376 + }, + { + "epoch": 5.972293642580521, + "grad_norm": 0.21281511914740617, + "learning_rate": 4.872201404123818e-06, + "loss": 0.1988, + "num_tokens": 3339200990.0, + "step": 4377 + }, + { + "epoch": 5.973659223539202, + "grad_norm": 0.22724318734770663, + "learning_rate": 4.868561633076817e-06, + "loss": 0.1984, + "num_tokens": 3339932316.0, + "step": 4378 + }, + { + "epoch": 5.975024804497882, + "grad_norm": 0.21459370475345238, + "learning_rate": 4.864923732466135e-06, + "loss": 0.2129, + "num_tokens": 3340700745.0, + "step": 4379 + }, + { + "epoch": 5.976390385456563, + "grad_norm": 0.21449347135227145, + "learning_rate": 4.861287703401528e-06, + "loss": 0.2116, + "num_tokens": 3341448333.0, + "step": 4380 + }, + { + "epoch": 5.977755966415243, + "grad_norm": 0.2179243628190327, + "learning_rate": 4.857653546992205e-06, + "loss": 0.2016, + "num_tokens": 3342188770.0, + "step": 4381 + }, + { + "epoch": 5.979121547373924, + "grad_norm": 0.22408681844375217, + "learning_rate": 4.854021264346781e-06, + "loss": 0.1956, + "num_tokens": 3342827064.0, + "step": 4382 + }, + { + "epoch": 5.980487128332604, + "grad_norm": 0.22089757909248758, + "learning_rate": 4.850390856573304e-06, + "loss": 0.2034, + "num_tokens": 3343515813.0, + "step": 4383 + }, + { + "epoch": 5.981852709291285, + "grad_norm": 0.21937237180422786, + "learning_rate": 4.846762324779266e-06, + "loss": 0.2074, + "num_tokens": 3344257150.0, + "step": 4384 + }, + { + "epoch": 5.983218290249965, + "grad_norm": 0.20936902348140668, + "learning_rate": 4.843135670071566e-06, + "loss": 0.2016, + "num_tokens": 3345005544.0, + "step": 4385 + }, + { + "epoch": 5.984583871208645, + "grad_norm": 0.21476437537676563, + "learning_rate": 4.839510893556542e-06, + "loss": 0.2042, + "num_tokens": 3345769869.0, + "step": 4386 + }, + { + "epoch": 5.985949452167326, + "grad_norm": 0.19352730341941765, + "learning_rate": 4.835887996339955e-06, + "loss": 0.2128, + "num_tokens": 3346596643.0, + "step": 4387 + }, + { + "epoch": 5.9873150331260065, + "grad_norm": 0.2097472418758118, + "learning_rate": 4.832266979526999e-06, + "loss": 0.2087, + "num_tokens": 3347315899.0, + "step": 4388 + }, + { + "epoch": 5.9886806140846875, + "grad_norm": 0.23191262269922386, + "learning_rate": 4.828647844222284e-06, + "loss": 0.2054, + "num_tokens": 3348024161.0, + "step": 4389 + }, + { + "epoch": 5.990046195043368, + "grad_norm": 0.22236814387083373, + "learning_rate": 4.825030591529851e-06, + "loss": 0.2075, + "num_tokens": 3348700695.0, + "step": 4390 + }, + { + "epoch": 5.991411776002049, + "grad_norm": 0.20109500113738168, + "learning_rate": 4.821415222553171e-06, + "loss": 0.2086, + "num_tokens": 3349506009.0, + "step": 4391 + }, + { + "epoch": 5.992777356960729, + "grad_norm": 0.2586930525477737, + "learning_rate": 4.817801738395134e-06, + "loss": 0.2037, + "num_tokens": 3350256018.0, + "step": 4392 + }, + { + "epoch": 5.99414293791941, + "grad_norm": 0.216341773149177, + "learning_rate": 4.814190140158062e-06, + "loss": 0.2052, + "num_tokens": 3351011012.0, + "step": 4393 + }, + { + "epoch": 5.99550851887809, + "grad_norm": 0.2138425056568521, + "learning_rate": 4.8105804289436895e-06, + "loss": 0.2052, + "num_tokens": 3351795937.0, + "step": 4394 + }, + { + "epoch": 5.99687409983677, + "grad_norm": 0.2155563522910789, + "learning_rate": 4.806972605853189e-06, + "loss": 0.2111, + "num_tokens": 3352557398.0, + "step": 4395 + }, + { + "epoch": 5.998239680795451, + "grad_norm": 0.20535220067156243, + "learning_rate": 4.803366671987148e-06, + "loss": 0.2037, + "num_tokens": 3353309510.0, + "step": 4396 + }, + { + "epoch": 5.999605261754131, + "grad_norm": 0.20591093891082932, + "learning_rate": 4.799762628445585e-06, + "loss": 0.2179, + "num_tokens": 3354066255.0, + "step": 4397 + }, + { + "epoch": 6.0, + "grad_norm": 0.20591093891082932, + "learning_rate": 4.7961604763279315e-06, + "loss": 0.1946, + "num_tokens": 3354252816.0, + "step": 4398 + }, + { + "epoch": 6.00136558095868, + "grad_norm": 0.4585177996520023, + "learning_rate": 4.792560216733051e-06, + "loss": 0.181, + "num_tokens": 3355009547.0, + "step": 4399 + }, + { + "epoch": 6.002731161917361, + "grad_norm": 0.3912397225765619, + "learning_rate": 4.7889618507592275e-06, + "loss": 0.1798, + "num_tokens": 3355696719.0, + "step": 4400 + }, + { + "epoch": 6.004096742876041, + "grad_norm": 0.33710216367674195, + "learning_rate": 4.785365379504166e-06, + "loss": 0.1741, + "num_tokens": 3356437241.0, + "step": 4401 + }, + { + "epoch": 6.005462323834722, + "grad_norm": 0.2911734032369926, + "learning_rate": 4.781770804064998e-06, + "loss": 0.1819, + "num_tokens": 3357218728.0, + "step": 4402 + }, + { + "epoch": 6.006827904793402, + "grad_norm": 0.23838591071348517, + "learning_rate": 4.7781781255382645e-06, + "loss": 0.1764, + "num_tokens": 3357980336.0, + "step": 4403 + }, + { + "epoch": 6.008193485752083, + "grad_norm": 0.2621093596072259, + "learning_rate": 4.774587345019942e-06, + "loss": 0.1829, + "num_tokens": 3358761707.0, + "step": 4404 + }, + { + "epoch": 6.0095590667107635, + "grad_norm": 0.3444977322073399, + "learning_rate": 4.770998463605423e-06, + "loss": 0.1804, + "num_tokens": 3359497582.0, + "step": 4405 + }, + { + "epoch": 6.0109246476694445, + "grad_norm": 0.28407986906967087, + "learning_rate": 4.767411482389515e-06, + "loss": 0.1761, + "num_tokens": 3360266768.0, + "step": 4406 + }, + { + "epoch": 6.012290228628125, + "grad_norm": 0.2965888928676144, + "learning_rate": 4.763826402466453e-06, + "loss": 0.1831, + "num_tokens": 3361038867.0, + "step": 4407 + }, + { + "epoch": 6.013655809586805, + "grad_norm": 0.24350841758275243, + "learning_rate": 4.760243224929891e-06, + "loss": 0.1767, + "num_tokens": 3361786004.0, + "step": 4408 + }, + { + "epoch": 6.015021390545486, + "grad_norm": 0.3118467831195779, + "learning_rate": 4.7566619508729e-06, + "loss": 0.1764, + "num_tokens": 3362559674.0, + "step": 4409 + }, + { + "epoch": 6.016386971504166, + "grad_norm": 0.2546083036589664, + "learning_rate": 4.75308258138797e-06, + "loss": 0.1682, + "num_tokens": 3363236421.0, + "step": 4410 + }, + { + "epoch": 6.017752552462847, + "grad_norm": 0.25020424151247606, + "learning_rate": 4.749505117567019e-06, + "loss": 0.1801, + "num_tokens": 3363974049.0, + "step": 4411 + }, + { + "epoch": 6.019118133421527, + "grad_norm": 0.238830001859413, + "learning_rate": 4.745929560501366e-06, + "loss": 0.1729, + "num_tokens": 3364789946.0, + "step": 4412 + }, + { + "epoch": 6.020483714380208, + "grad_norm": 0.2264753968447385, + "learning_rate": 4.742355911281763e-06, + "loss": 0.1694, + "num_tokens": 3365568023.0, + "step": 4413 + }, + { + "epoch": 6.021849295338888, + "grad_norm": 0.21654944571911136, + "learning_rate": 4.73878417099838e-06, + "loss": 0.1702, + "num_tokens": 3366260484.0, + "step": 4414 + }, + { + "epoch": 6.023214876297569, + "grad_norm": 0.2584160311866936, + "learning_rate": 4.7352143407407895e-06, + "loss": 0.1751, + "num_tokens": 3366963404.0, + "step": 4415 + }, + { + "epoch": 6.024580457256249, + "grad_norm": 0.25247587751989753, + "learning_rate": 4.7316464215980056e-06, + "loss": 0.1764, + "num_tokens": 3367782536.0, + "step": 4416 + }, + { + "epoch": 6.025946038214929, + "grad_norm": 0.23335092427991314, + "learning_rate": 4.728080414658436e-06, + "loss": 0.1746, + "num_tokens": 3368492002.0, + "step": 4417 + }, + { + "epoch": 6.02731161917361, + "grad_norm": 0.227242114088925, + "learning_rate": 4.724516321009917e-06, + "loss": 0.1799, + "num_tokens": 3369247279.0, + "step": 4418 + }, + { + "epoch": 6.02867720013229, + "grad_norm": 0.22150753316181543, + "learning_rate": 4.720954141739702e-06, + "loss": 0.171, + "num_tokens": 3370025726.0, + "step": 4419 + }, + { + "epoch": 6.030042781090971, + "grad_norm": 0.23001812186809834, + "learning_rate": 4.717393877934458e-06, + "loss": 0.1742, + "num_tokens": 3370697764.0, + "step": 4420 + }, + { + "epoch": 6.0314083620496515, + "grad_norm": 0.24462877876179195, + "learning_rate": 4.713835530680263e-06, + "loss": 0.1789, + "num_tokens": 3371422974.0, + "step": 4421 + }, + { + "epoch": 6.0327739430083325, + "grad_norm": 0.22176696762083187, + "learning_rate": 4.710279101062616e-06, + "loss": 0.1695, + "num_tokens": 3372206398.0, + "step": 4422 + }, + { + "epoch": 6.034139523967013, + "grad_norm": 0.21552229185027025, + "learning_rate": 4.706724590166434e-06, + "loss": 0.1669, + "num_tokens": 3372940430.0, + "step": 4423 + }, + { + "epoch": 6.035505104925694, + "grad_norm": 0.2415655040465163, + "learning_rate": 4.7031719990760355e-06, + "loss": 0.1785, + "num_tokens": 3373715318.0, + "step": 4424 + }, + { + "epoch": 6.036870685884374, + "grad_norm": 0.2081411910962891, + "learning_rate": 4.699621328875173e-06, + "loss": 0.1797, + "num_tokens": 3374438548.0, + "step": 4425 + }, + { + "epoch": 6.038236266843054, + "grad_norm": 0.23066394910645086, + "learning_rate": 4.696072580646994e-06, + "loss": 0.1771, + "num_tokens": 3375183388.0, + "step": 4426 + }, + { + "epoch": 6.039601847801735, + "grad_norm": 0.2259500594816629, + "learning_rate": 4.692525755474071e-06, + "loss": 0.1742, + "num_tokens": 3375949950.0, + "step": 4427 + }, + { + "epoch": 6.040967428760415, + "grad_norm": 0.23536400447232905, + "learning_rate": 4.688980854438388e-06, + "loss": 0.1826, + "num_tokens": 3376741255.0, + "step": 4428 + }, + { + "epoch": 6.042333009719096, + "grad_norm": 0.21588219657404012, + "learning_rate": 4.685437878621339e-06, + "loss": 0.1758, + "num_tokens": 3377621258.0, + "step": 4429 + }, + { + "epoch": 6.043698590677776, + "grad_norm": 0.2148373107385538, + "learning_rate": 4.681896829103732e-06, + "loss": 0.1759, + "num_tokens": 3378354161.0, + "step": 4430 + }, + { + "epoch": 6.045064171636457, + "grad_norm": 0.23782115443865062, + "learning_rate": 4.6783577069657875e-06, + "loss": 0.1876, + "num_tokens": 3379080139.0, + "step": 4431 + }, + { + "epoch": 6.046429752595137, + "grad_norm": 0.22351969689844262, + "learning_rate": 4.674820513287142e-06, + "loss": 0.1661, + "num_tokens": 3379819878.0, + "step": 4432 + }, + { + "epoch": 6.047795333553818, + "grad_norm": 0.2233047141075785, + "learning_rate": 4.671285249146834e-06, + "loss": 0.1734, + "num_tokens": 3380548057.0, + "step": 4433 + }, + { + "epoch": 6.049160914512498, + "grad_norm": 0.2189354258042658, + "learning_rate": 4.667751915623328e-06, + "loss": 0.1827, + "num_tokens": 3381369615.0, + "step": 4434 + }, + { + "epoch": 6.050526495471178, + "grad_norm": 0.21842866551626267, + "learning_rate": 4.664220513794483e-06, + "loss": 0.1698, + "num_tokens": 3382148636.0, + "step": 4435 + }, + { + "epoch": 6.051892076429859, + "grad_norm": 0.21784368416807026, + "learning_rate": 4.6606910447375804e-06, + "loss": 0.1843, + "num_tokens": 3382901160.0, + "step": 4436 + }, + { + "epoch": 6.0532576573885395, + "grad_norm": 0.22789367116511622, + "learning_rate": 4.657163509529311e-06, + "loss": 0.181, + "num_tokens": 3383738696.0, + "step": 4437 + }, + { + "epoch": 6.0546232383472205, + "grad_norm": 0.23006143939050336, + "learning_rate": 4.653637909245767e-06, + "loss": 0.1716, + "num_tokens": 3384400405.0, + "step": 4438 + }, + { + "epoch": 6.055988819305901, + "grad_norm": 0.2214385613151796, + "learning_rate": 4.650114244962458e-06, + "loss": 0.1797, + "num_tokens": 3385235183.0, + "step": 4439 + }, + { + "epoch": 6.057354400264582, + "grad_norm": 0.22803755873452153, + "learning_rate": 4.6465925177543036e-06, + "loss": 0.1793, + "num_tokens": 3386014740.0, + "step": 4440 + }, + { + "epoch": 6.058719981223262, + "grad_norm": 0.21688395070305036, + "learning_rate": 4.643072728695632e-06, + "loss": 0.1818, + "num_tokens": 3386848644.0, + "step": 4441 + }, + { + "epoch": 6.060085562181943, + "grad_norm": 0.2162227270852833, + "learning_rate": 4.639554878860171e-06, + "loss": 0.1711, + "num_tokens": 3387609591.0, + "step": 4442 + }, + { + "epoch": 6.061451143140623, + "grad_norm": 0.21940339360493047, + "learning_rate": 4.636038969321073e-06, + "loss": 0.1773, + "num_tokens": 3388412567.0, + "step": 4443 + }, + { + "epoch": 6.062816724099303, + "grad_norm": 0.2341942673425418, + "learning_rate": 4.632525001150887e-06, + "loss": 0.1782, + "num_tokens": 3389198038.0, + "step": 4444 + }, + { + "epoch": 6.064182305057984, + "grad_norm": 0.2174948647541384, + "learning_rate": 4.629012975421566e-06, + "loss": 0.1773, + "num_tokens": 3390069029.0, + "step": 4445 + }, + { + "epoch": 6.065547886016664, + "grad_norm": 0.21484919381704048, + "learning_rate": 4.625502893204487e-06, + "loss": 0.177, + "num_tokens": 3390821019.0, + "step": 4446 + }, + { + "epoch": 6.066913466975345, + "grad_norm": 0.21567029198169105, + "learning_rate": 4.621994755570416e-06, + "loss": 0.176, + "num_tokens": 3391566032.0, + "step": 4447 + }, + { + "epoch": 6.068279047934025, + "grad_norm": 0.22083606918292492, + "learning_rate": 4.618488563589538e-06, + "loss": 0.1732, + "num_tokens": 3392316762.0, + "step": 4448 + }, + { + "epoch": 6.069644628892706, + "grad_norm": 0.2138134991843932, + "learning_rate": 4.614984318331439e-06, + "loss": 0.1749, + "num_tokens": 3393033351.0, + "step": 4449 + }, + { + "epoch": 6.071010209851386, + "grad_norm": 0.21376723582755067, + "learning_rate": 4.611482020865116e-06, + "loss": 0.1765, + "num_tokens": 3393730096.0, + "step": 4450 + }, + { + "epoch": 6.072375790810067, + "grad_norm": 0.2513444585246254, + "learning_rate": 4.6079816722589636e-06, + "loss": 0.1796, + "num_tokens": 3394541780.0, + "step": 4451 + }, + { + "epoch": 6.073741371768747, + "grad_norm": 0.2184302765928734, + "learning_rate": 4.604483273580786e-06, + "loss": 0.1795, + "num_tokens": 3395359254.0, + "step": 4452 + }, + { + "epoch": 6.0751069527274275, + "grad_norm": 0.22094451280438968, + "learning_rate": 4.6009868258977995e-06, + "loss": 0.1737, + "num_tokens": 3396095784.0, + "step": 4453 + }, + { + "epoch": 6.0764725336861085, + "grad_norm": 0.21994515782183258, + "learning_rate": 4.597492330276608e-06, + "loss": 0.1734, + "num_tokens": 3396822887.0, + "step": 4454 + }, + { + "epoch": 6.077838114644789, + "grad_norm": 0.22331371034204653, + "learning_rate": 4.593999787783245e-06, + "loss": 0.183, + "num_tokens": 3397611851.0, + "step": 4455 + }, + { + "epoch": 6.07920369560347, + "grad_norm": 0.2293000144112066, + "learning_rate": 4.5905091994831225e-06, + "loss": 0.1751, + "num_tokens": 3398318025.0, + "step": 4456 + }, + { + "epoch": 6.08056927656215, + "grad_norm": 0.22725068438461382, + "learning_rate": 4.587020566441071e-06, + "loss": 0.1646, + "num_tokens": 3399049300.0, + "step": 4457 + }, + { + "epoch": 6.081934857520831, + "grad_norm": 0.21259847673923732, + "learning_rate": 4.583533889721323e-06, + "loss": 0.1827, + "num_tokens": 3399921574.0, + "step": 4458 + }, + { + "epoch": 6.083300438479511, + "grad_norm": 0.24750316055113192, + "learning_rate": 4.580049170387515e-06, + "loss": 0.1786, + "num_tokens": 3400719251.0, + "step": 4459 + }, + { + "epoch": 6.084666019438192, + "grad_norm": 0.23655916061139723, + "learning_rate": 4.576566409502678e-06, + "loss": 0.1812, + "num_tokens": 3401467639.0, + "step": 4460 + }, + { + "epoch": 6.086031600396872, + "grad_norm": 0.2294654619347723, + "learning_rate": 4.573085608129253e-06, + "loss": 0.189, + "num_tokens": 3402161919.0, + "step": 4461 + }, + { + "epoch": 6.087397181355552, + "grad_norm": 0.22522034877832006, + "learning_rate": 4.5696067673290875e-06, + "loss": 0.1779, + "num_tokens": 3402916033.0, + "step": 4462 + }, + { + "epoch": 6.088762762314233, + "grad_norm": 0.2108770097286622, + "learning_rate": 4.566129888163413e-06, + "loss": 0.1785, + "num_tokens": 3403637405.0, + "step": 4463 + }, + { + "epoch": 6.090128343272913, + "grad_norm": 0.218034538404724, + "learning_rate": 4.562654971692889e-06, + "loss": 0.1736, + "num_tokens": 3404425002.0, + "step": 4464 + }, + { + "epoch": 6.091493924231594, + "grad_norm": 0.23074185975280587, + "learning_rate": 4.559182018977552e-06, + "loss": 0.1734, + "num_tokens": 3405215493.0, + "step": 4465 + }, + { + "epoch": 6.092859505190274, + "grad_norm": 0.2071715513395369, + "learning_rate": 4.555711031076854e-06, + "loss": 0.175, + "num_tokens": 3406008591.0, + "step": 4466 + }, + { + "epoch": 6.094225086148955, + "grad_norm": 0.22648826715597087, + "learning_rate": 4.552242009049644e-06, + "loss": 0.1788, + "num_tokens": 3406765138.0, + "step": 4467 + }, + { + "epoch": 6.095590667107635, + "grad_norm": 0.24025039898958242, + "learning_rate": 4.548774953954166e-06, + "loss": 0.1707, + "num_tokens": 3407468920.0, + "step": 4468 + }, + { + "epoch": 6.096956248066316, + "grad_norm": 0.23154871631644736, + "learning_rate": 4.545309866848071e-06, + "loss": 0.1791, + "num_tokens": 3408226598.0, + "step": 4469 + }, + { + "epoch": 6.0983218290249965, + "grad_norm": 0.2216963549401061, + "learning_rate": 4.5418467487884064e-06, + "loss": 0.1817, + "num_tokens": 3408954614.0, + "step": 4470 + }, + { + "epoch": 6.099687409983677, + "grad_norm": 0.2994055485967713, + "learning_rate": 4.538385600831622e-06, + "loss": 0.1782, + "num_tokens": 3409643700.0, + "step": 4471 + }, + { + "epoch": 6.101052990942358, + "grad_norm": 0.22041232213414622, + "learning_rate": 4.5349264240335574e-06, + "loss": 0.1802, + "num_tokens": 3410447795.0, + "step": 4472 + }, + { + "epoch": 6.102418571901038, + "grad_norm": 0.23199155100203914, + "learning_rate": 4.531469219449468e-06, + "loss": 0.1784, + "num_tokens": 3411245692.0, + "step": 4473 + }, + { + "epoch": 6.103784152859719, + "grad_norm": 0.21577669277288322, + "learning_rate": 4.528013988133987e-06, + "loss": 0.1784, + "num_tokens": 3411963004.0, + "step": 4474 + }, + { + "epoch": 6.105149733818399, + "grad_norm": 0.2417349130187108, + "learning_rate": 4.524560731141162e-06, + "loss": 0.1854, + "num_tokens": 3412725041.0, + "step": 4475 + }, + { + "epoch": 6.10651531477708, + "grad_norm": 0.21618688313939852, + "learning_rate": 4.5211094495244325e-06, + "loss": 0.1817, + "num_tokens": 3413477648.0, + "step": 4476 + }, + { + "epoch": 6.10788089573576, + "grad_norm": 0.23405788037925568, + "learning_rate": 4.5176601443366314e-06, + "loss": 0.1785, + "num_tokens": 3414244062.0, + "step": 4477 + }, + { + "epoch": 6.109246476694441, + "grad_norm": 0.2197580712372545, + "learning_rate": 4.514212816629993e-06, + "loss": 0.1754, + "num_tokens": 3415066114.0, + "step": 4478 + }, + { + "epoch": 6.110612057653121, + "grad_norm": 0.22440709069970965, + "learning_rate": 4.51076746745615e-06, + "loss": 0.1784, + "num_tokens": 3415798601.0, + "step": 4479 + }, + { + "epoch": 6.111977638611801, + "grad_norm": 0.22378451203640654, + "learning_rate": 4.507324097866126e-06, + "loss": 0.1818, + "num_tokens": 3416591434.0, + "step": 4480 + }, + { + "epoch": 6.113343219570482, + "grad_norm": 0.2421120759788064, + "learning_rate": 4.503882708910347e-06, + "loss": 0.1798, + "num_tokens": 3417386431.0, + "step": 4481 + }, + { + "epoch": 6.114708800529162, + "grad_norm": 0.21826931331134983, + "learning_rate": 4.500443301638633e-06, + "loss": 0.1749, + "num_tokens": 3418202590.0, + "step": 4482 + }, + { + "epoch": 6.116074381487843, + "grad_norm": 0.23908864840311186, + "learning_rate": 4.4970058771001925e-06, + "loss": 0.1852, + "num_tokens": 3419044902.0, + "step": 4483 + }, + { + "epoch": 6.117439962446523, + "grad_norm": 0.2208685361586619, + "learning_rate": 4.493570436343638e-06, + "loss": 0.1712, + "num_tokens": 3419861317.0, + "step": 4484 + }, + { + "epoch": 6.118805543405204, + "grad_norm": 0.21650802735400249, + "learning_rate": 4.490136980416978e-06, + "loss": 0.1826, + "num_tokens": 3420613672.0, + "step": 4485 + }, + { + "epoch": 6.1201711243638846, + "grad_norm": 0.20719033601158465, + "learning_rate": 4.486705510367605e-06, + "loss": 0.1778, + "num_tokens": 3421490447.0, + "step": 4486 + }, + { + "epoch": 6.1215367053225656, + "grad_norm": 0.21906419131915372, + "learning_rate": 4.483276027242313e-06, + "loss": 0.1833, + "num_tokens": 3422250044.0, + "step": 4487 + }, + { + "epoch": 6.122902286281246, + "grad_norm": 0.21810450746205956, + "learning_rate": 4.479848532087288e-06, + "loss": 0.1796, + "num_tokens": 3423065073.0, + "step": 4488 + }, + { + "epoch": 6.124267867239926, + "grad_norm": 0.20301923761216395, + "learning_rate": 4.4764230259481145e-06, + "loss": 0.1796, + "num_tokens": 3423817357.0, + "step": 4489 + }, + { + "epoch": 6.125633448198607, + "grad_norm": 0.22063538435052682, + "learning_rate": 4.472999509869766e-06, + "loss": 0.1911, + "num_tokens": 3424653070.0, + "step": 4490 + }, + { + "epoch": 6.126999029157287, + "grad_norm": 0.22120455236533326, + "learning_rate": 4.469577984896602e-06, + "loss": 0.1809, + "num_tokens": 3425514557.0, + "step": 4491 + }, + { + "epoch": 6.128364610115968, + "grad_norm": 0.22083477452312278, + "learning_rate": 4.466158452072388e-06, + "loss": 0.1753, + "num_tokens": 3426267036.0, + "step": 4492 + }, + { + "epoch": 6.129730191074648, + "grad_norm": 0.22188928878077294, + "learning_rate": 4.462740912440274e-06, + "loss": 0.1804, + "num_tokens": 3427004673.0, + "step": 4493 + }, + { + "epoch": 6.131095772033329, + "grad_norm": 0.23114820763976177, + "learning_rate": 4.459325367042804e-06, + "loss": 0.1694, + "num_tokens": 3427737863.0, + "step": 4494 + }, + { + "epoch": 6.132461352992009, + "grad_norm": 0.2233106292754563, + "learning_rate": 4.45591181692191e-06, + "loss": 0.1752, + "num_tokens": 3428510172.0, + "step": 4495 + }, + { + "epoch": 6.13382693395069, + "grad_norm": 0.33553311068362973, + "learning_rate": 4.452500263118922e-06, + "loss": 0.178, + "num_tokens": 3429166656.0, + "step": 4496 + }, + { + "epoch": 6.13519251490937, + "grad_norm": 0.22051775567612125, + "learning_rate": 4.449090706674554e-06, + "loss": 0.1801, + "num_tokens": 3429905954.0, + "step": 4497 + }, + { + "epoch": 6.13655809586805, + "grad_norm": 0.23927461724348775, + "learning_rate": 4.445683148628915e-06, + "loss": 0.1763, + "num_tokens": 3430563869.0, + "step": 4498 + }, + { + "epoch": 6.137923676826731, + "grad_norm": 0.23098489194021554, + "learning_rate": 4.442277590021509e-06, + "loss": 0.1721, + "num_tokens": 3431279791.0, + "step": 4499 + }, + { + "epoch": 6.1392892577854115, + "grad_norm": 0.21249157348244865, + "learning_rate": 4.4388740318912145e-06, + "loss": 0.1797, + "num_tokens": 3432049749.0, + "step": 4500 + }, + { + "epoch": 6.1406548387440925, + "grad_norm": 0.22697925713857062, + "learning_rate": 4.435472475276316e-06, + "loss": 0.1698, + "num_tokens": 3432829417.0, + "step": 4501 + }, + { + "epoch": 6.142020419702773, + "grad_norm": 0.22325960952336804, + "learning_rate": 4.432072921214479e-06, + "loss": 0.1753, + "num_tokens": 3433598933.0, + "step": 4502 + }, + { + "epoch": 6.143386000661454, + "grad_norm": 0.21779056871334146, + "learning_rate": 4.428675370742763e-06, + "loss": 0.1727, + "num_tokens": 3434403845.0, + "step": 4503 + }, + { + "epoch": 6.144751581620134, + "grad_norm": 0.21539000044813883, + "learning_rate": 4.425279824897608e-06, + "loss": 0.1781, + "num_tokens": 3435176577.0, + "step": 4504 + }, + { + "epoch": 6.146117162578815, + "grad_norm": 0.219663716664207, + "learning_rate": 4.421886284714856e-06, + "loss": 0.1813, + "num_tokens": 3436058064.0, + "step": 4505 + }, + { + "epoch": 6.147482743537495, + "grad_norm": 0.19782339122765638, + "learning_rate": 4.418494751229725e-06, + "loss": 0.183, + "num_tokens": 3436878576.0, + "step": 4506 + }, + { + "epoch": 6.148848324496175, + "grad_norm": 0.21203259430202637, + "learning_rate": 4.415105225476818e-06, + "loss": 0.1809, + "num_tokens": 3437673093.0, + "step": 4507 + }, + { + "epoch": 6.150213905454856, + "grad_norm": 0.2336275625240025, + "learning_rate": 4.4117177084901465e-06, + "loss": 0.1792, + "num_tokens": 3438375178.0, + "step": 4508 + }, + { + "epoch": 6.151579486413536, + "grad_norm": 0.2316479732396051, + "learning_rate": 4.408332201303084e-06, + "loss": 0.1721, + "num_tokens": 3439112672.0, + "step": 4509 + }, + { + "epoch": 6.152945067372217, + "grad_norm": 0.22163746027198764, + "learning_rate": 4.4049487049484055e-06, + "loss": 0.1797, + "num_tokens": 3439899433.0, + "step": 4510 + }, + { + "epoch": 6.154310648330897, + "grad_norm": 0.22028382182738873, + "learning_rate": 4.401567220458271e-06, + "loss": 0.1707, + "num_tokens": 3440609305.0, + "step": 4511 + }, + { + "epoch": 6.155676229289578, + "grad_norm": 0.23262887979016025, + "learning_rate": 4.398187748864226e-06, + "loss": 0.1805, + "num_tokens": 3441238978.0, + "step": 4512 + }, + { + "epoch": 6.157041810248258, + "grad_norm": 0.24789304673878493, + "learning_rate": 4.394810291197193e-06, + "loss": 0.1777, + "num_tokens": 3442056775.0, + "step": 4513 + }, + { + "epoch": 6.158407391206939, + "grad_norm": 0.20969774356356577, + "learning_rate": 4.391434848487498e-06, + "loss": 0.1717, + "num_tokens": 3442815152.0, + "step": 4514 + }, + { + "epoch": 6.159772972165619, + "grad_norm": 0.23276591209687553, + "learning_rate": 4.3880614217648395e-06, + "loss": 0.1838, + "num_tokens": 3443518406.0, + "step": 4515 + }, + { + "epoch": 6.1611385531242995, + "grad_norm": 0.2160518447031787, + "learning_rate": 4.3846900120582965e-06, + "loss": 0.1847, + "num_tokens": 3444280412.0, + "step": 4516 + }, + { + "epoch": 6.1625041340829805, + "grad_norm": 0.21765141216990866, + "learning_rate": 4.3813206203963535e-06, + "loss": 0.1742, + "num_tokens": 3444984664.0, + "step": 4517 + }, + { + "epoch": 6.163869715041661, + "grad_norm": 0.23683736636523095, + "learning_rate": 4.377953247806853e-06, + "loss": 0.1729, + "num_tokens": 3445790250.0, + "step": 4518 + }, + { + "epoch": 6.165235296000342, + "grad_norm": 0.22052113787648814, + "learning_rate": 4.374587895317042e-06, + "loss": 0.1774, + "num_tokens": 3446562538.0, + "step": 4519 + }, + { + "epoch": 6.166600876959022, + "grad_norm": 0.2272677270213607, + "learning_rate": 4.37122456395354e-06, + "loss": 0.1829, + "num_tokens": 3447364960.0, + "step": 4520 + }, + { + "epoch": 6.167966457917703, + "grad_norm": 0.21986032808084735, + "learning_rate": 4.367863254742361e-06, + "loss": 0.175, + "num_tokens": 3448198825.0, + "step": 4521 + }, + { + "epoch": 6.169332038876383, + "grad_norm": 0.23100630994472057, + "learning_rate": 4.364503968708885e-06, + "loss": 0.1789, + "num_tokens": 3448973515.0, + "step": 4522 + }, + { + "epoch": 6.170697619835064, + "grad_norm": 0.2324724777517825, + "learning_rate": 4.36114670687789e-06, + "loss": 0.1782, + "num_tokens": 3449732948.0, + "step": 4523 + }, + { + "epoch": 6.172063200793744, + "grad_norm": 0.22402977192388637, + "learning_rate": 4.357791470273534e-06, + "loss": 0.1719, + "num_tokens": 3450423957.0, + "step": 4524 + }, + { + "epoch": 6.173428781752424, + "grad_norm": 0.23447390052313588, + "learning_rate": 4.354438259919343e-06, + "loss": 0.1887, + "num_tokens": 3451285379.0, + "step": 4525 + }, + { + "epoch": 6.174794362711105, + "grad_norm": 0.21287784464370527, + "learning_rate": 4.351087076838251e-06, + "loss": 0.176, + "num_tokens": 3452094627.0, + "step": 4526 + }, + { + "epoch": 6.176159943669785, + "grad_norm": 0.22071043461752263, + "learning_rate": 4.347737922052549e-06, + "loss": 0.1736, + "num_tokens": 3452861074.0, + "step": 4527 + }, + { + "epoch": 6.177525524628466, + "grad_norm": 0.2277140706596248, + "learning_rate": 4.344390796583922e-06, + "loss": 0.1721, + "num_tokens": 3453596726.0, + "step": 4528 + }, + { + "epoch": 6.178891105587146, + "grad_norm": 0.20687993440965713, + "learning_rate": 4.3410457014534355e-06, + "loss": 0.1769, + "num_tokens": 3454321168.0, + "step": 4529 + }, + { + "epoch": 6.180256686545827, + "grad_norm": 0.22660369626935795, + "learning_rate": 4.337702637681529e-06, + "loss": 0.1679, + "num_tokens": 3455015952.0, + "step": 4530 + }, + { + "epoch": 6.181622267504507, + "grad_norm": 0.24304606269690304, + "learning_rate": 4.334361606288027e-06, + "loss": 0.1883, + "num_tokens": 3455784264.0, + "step": 4531 + }, + { + "epoch": 6.182987848463188, + "grad_norm": 0.23653850649487354, + "learning_rate": 4.3310226082921344e-06, + "loss": 0.1856, + "num_tokens": 3456527875.0, + "step": 4532 + }, + { + "epoch": 6.1843534294218685, + "grad_norm": 0.25063315056713936, + "learning_rate": 4.3276856447124395e-06, + "loss": 0.1776, + "num_tokens": 3457290527.0, + "step": 4533 + }, + { + "epoch": 6.185719010380549, + "grad_norm": 0.23021076779045613, + "learning_rate": 4.3243507165668954e-06, + "loss": 0.1849, + "num_tokens": 3458066186.0, + "step": 4534 + }, + { + "epoch": 6.18708459133923, + "grad_norm": 0.20518928027347919, + "learning_rate": 4.321017824872854e-06, + "loss": 0.1765, + "num_tokens": 3458870754.0, + "step": 4535 + }, + { + "epoch": 6.18845017229791, + "grad_norm": 0.22452478666694203, + "learning_rate": 4.31768697064703e-06, + "loss": 0.1798, + "num_tokens": 3459651791.0, + "step": 4536 + }, + { + "epoch": 6.189815753256591, + "grad_norm": 0.22135071290567507, + "learning_rate": 4.314358154905525e-06, + "loss": 0.175, + "num_tokens": 3460355593.0, + "step": 4537 + }, + { + "epoch": 6.191181334215271, + "grad_norm": 0.2302598794395475, + "learning_rate": 4.311031378663819e-06, + "loss": 0.1789, + "num_tokens": 3461035221.0, + "step": 4538 + }, + { + "epoch": 6.192546915173952, + "grad_norm": 0.2451951568308318, + "learning_rate": 4.307706642936761e-06, + "loss": 0.1858, + "num_tokens": 3461808868.0, + "step": 4539 + }, + { + "epoch": 6.193912496132632, + "grad_norm": 0.21250871549966083, + "learning_rate": 4.3043839487385895e-06, + "loss": 0.1738, + "num_tokens": 3462628996.0, + "step": 4540 + }, + { + "epoch": 6.195278077091313, + "grad_norm": 0.22757375689874892, + "learning_rate": 4.301063297082912e-06, + "loss": 0.1808, + "num_tokens": 3463377448.0, + "step": 4541 + }, + { + "epoch": 6.196643658049993, + "grad_norm": 0.22411977585476253, + "learning_rate": 4.297744688982719e-06, + "loss": 0.1847, + "num_tokens": 3464185628.0, + "step": 4542 + }, + { + "epoch": 6.198009239008673, + "grad_norm": 0.21718502314699714, + "learning_rate": 4.294428125450365e-06, + "loss": 0.1754, + "num_tokens": 3464979233.0, + "step": 4543 + }, + { + "epoch": 6.199374819967354, + "grad_norm": 0.24288196154103228, + "learning_rate": 4.2911136074976024e-06, + "loss": 0.18, + "num_tokens": 3465772271.0, + "step": 4544 + }, + { + "epoch": 6.200740400926034, + "grad_norm": 0.21077125200331845, + "learning_rate": 4.2878011361355386e-06, + "loss": 0.1811, + "num_tokens": 3466479562.0, + "step": 4545 + }, + { + "epoch": 6.202105981884715, + "grad_norm": 0.22794401023079633, + "learning_rate": 4.2844907123746685e-06, + "loss": 0.1779, + "num_tokens": 3467190071.0, + "step": 4546 + }, + { + "epoch": 6.203471562843395, + "grad_norm": 0.22703259780312576, + "learning_rate": 4.28118233722486e-06, + "loss": 0.1783, + "num_tokens": 3468019980.0, + "step": 4547 + }, + { + "epoch": 6.204837143802076, + "grad_norm": 0.22351404386950546, + "learning_rate": 4.27787601169535e-06, + "loss": 0.1804, + "num_tokens": 3468716464.0, + "step": 4548 + }, + { + "epoch": 6.2062027247607565, + "grad_norm": 0.22731524453914415, + "learning_rate": 4.274571736794758e-06, + "loss": 0.1716, + "num_tokens": 3469433911.0, + "step": 4549 + }, + { + "epoch": 6.2075683057194375, + "grad_norm": 0.2198255676591793, + "learning_rate": 4.271269513531075e-06, + "loss": 0.1812, + "num_tokens": 3470219400.0, + "step": 4550 + }, + { + "epoch": 6.208933886678118, + "grad_norm": 0.22030132556363524, + "learning_rate": 4.267969342911672e-06, + "loss": 0.1796, + "num_tokens": 3470966642.0, + "step": 4551 + }, + { + "epoch": 6.210299467636798, + "grad_norm": 0.23123225135684844, + "learning_rate": 4.2646712259432755e-06, + "loss": 0.1825, + "num_tokens": 3471709148.0, + "step": 4552 + }, + { + "epoch": 6.211665048595479, + "grad_norm": 0.220776375978549, + "learning_rate": 4.261375163632012e-06, + "loss": 0.1839, + "num_tokens": 3472527777.0, + "step": 4553 + }, + { + "epoch": 6.213030629554159, + "grad_norm": 0.22186720333694754, + "learning_rate": 4.2580811569833584e-06, + "loss": 0.1816, + "num_tokens": 3473302167.0, + "step": 4554 + }, + { + "epoch": 6.21439621051284, + "grad_norm": 0.22048593978822026, + "learning_rate": 4.254789207002177e-06, + "loss": 0.1843, + "num_tokens": 3474108664.0, + "step": 4555 + }, + { + "epoch": 6.21576179147152, + "grad_norm": 0.21574615102864872, + "learning_rate": 4.2514993146927e-06, + "loss": 0.1758, + "num_tokens": 3474804538.0, + "step": 4556 + }, + { + "epoch": 6.217127372430201, + "grad_norm": 0.22117376048687712, + "learning_rate": 4.248211481058526e-06, + "loss": 0.1784, + "num_tokens": 3475548271.0, + "step": 4557 + }, + { + "epoch": 6.218492953388881, + "grad_norm": 0.23166457922103698, + "learning_rate": 4.244925707102635e-06, + "loss": 0.1777, + "num_tokens": 3476291397.0, + "step": 4558 + }, + { + "epoch": 6.219858534347562, + "grad_norm": 0.21762628034392542, + "learning_rate": 4.241641993827373e-06, + "loss": 0.1825, + "num_tokens": 3477124355.0, + "step": 4559 + }, + { + "epoch": 6.221224115306242, + "grad_norm": 0.213444512991053, + "learning_rate": 4.238360342234461e-06, + "loss": 0.1799, + "num_tokens": 3477906054.0, + "step": 4560 + }, + { + "epoch": 6.222589696264922, + "grad_norm": 0.22204746703167785, + "learning_rate": 4.235080753324985e-06, + "loss": 0.1777, + "num_tokens": 3478660140.0, + "step": 4561 + }, + { + "epoch": 6.223955277223603, + "grad_norm": 0.2744635911610287, + "learning_rate": 4.231803228099408e-06, + "loss": 0.1725, + "num_tokens": 3479478041.0, + "step": 4562 + }, + { + "epoch": 6.225320858182283, + "grad_norm": 0.2229204134558285, + "learning_rate": 4.22852776755756e-06, + "loss": 0.1792, + "num_tokens": 3480180854.0, + "step": 4563 + }, + { + "epoch": 6.226686439140964, + "grad_norm": 0.2214870107436558, + "learning_rate": 4.225254372698641e-06, + "loss": 0.1762, + "num_tokens": 3480959968.0, + "step": 4564 + }, + { + "epoch": 6.2280520200996445, + "grad_norm": 0.22565969202025818, + "learning_rate": 4.221983044521229e-06, + "loss": 0.1755, + "num_tokens": 3481780576.0, + "step": 4565 + }, + { + "epoch": 6.2294176010583255, + "grad_norm": 0.21302069759579312, + "learning_rate": 4.218713784023255e-06, + "loss": 0.1763, + "num_tokens": 3482538169.0, + "step": 4566 + }, + { + "epoch": 6.230783182017006, + "grad_norm": 0.21108712225024537, + "learning_rate": 4.215446592202033e-06, + "loss": 0.1797, + "num_tokens": 3483309776.0, + "step": 4567 + }, + { + "epoch": 6.232148762975687, + "grad_norm": 0.21318830661838534, + "learning_rate": 4.2121814700542414e-06, + "loss": 0.1731, + "num_tokens": 3484077354.0, + "step": 4568 + }, + { + "epoch": 6.233514343934367, + "grad_norm": 0.21814291554272444, + "learning_rate": 4.208918418575928e-06, + "loss": 0.1736, + "num_tokens": 3484863806.0, + "step": 4569 + }, + { + "epoch": 6.234879924893047, + "grad_norm": 0.2142705634924007, + "learning_rate": 4.205657438762511e-06, + "loss": 0.181, + "num_tokens": 3485718376.0, + "step": 4570 + }, + { + "epoch": 6.236245505851728, + "grad_norm": 0.21529194721794206, + "learning_rate": 4.20239853160877e-06, + "loss": 0.1695, + "num_tokens": 3486430360.0, + "step": 4571 + }, + { + "epoch": 6.237611086810408, + "grad_norm": 0.22063129748436203, + "learning_rate": 4.199141698108856e-06, + "loss": 0.1827, + "num_tokens": 3487188606.0, + "step": 4572 + }, + { + "epoch": 6.238976667769089, + "grad_norm": 0.2307116979269138, + "learning_rate": 4.195886939256291e-06, + "loss": 0.1801, + "num_tokens": 3487953543.0, + "step": 4573 + }, + { + "epoch": 6.240342248727769, + "grad_norm": 0.22287422631113657, + "learning_rate": 4.192634256043963e-06, + "loss": 0.1792, + "num_tokens": 3488739397.0, + "step": 4574 + }, + { + "epoch": 6.24170782968645, + "grad_norm": 0.20953073984959866, + "learning_rate": 4.1893836494641185e-06, + "loss": 0.1857, + "num_tokens": 3489591658.0, + "step": 4575 + }, + { + "epoch": 6.24307341064513, + "grad_norm": 0.2155306505891302, + "learning_rate": 4.186135120508379e-06, + "loss": 0.1759, + "num_tokens": 3490270154.0, + "step": 4576 + }, + { + "epoch": 6.244438991603811, + "grad_norm": 0.22675852117196066, + "learning_rate": 4.182888670167733e-06, + "loss": 0.1786, + "num_tokens": 3490938599.0, + "step": 4577 + }, + { + "epoch": 6.245804572562491, + "grad_norm": 0.22114424595309687, + "learning_rate": 4.179644299432526e-06, + "loss": 0.1825, + "num_tokens": 3491691608.0, + "step": 4578 + }, + { + "epoch": 6.247170153521171, + "grad_norm": 0.22130197307457095, + "learning_rate": 4.176402009292483e-06, + "loss": 0.1842, + "num_tokens": 3492428160.0, + "step": 4579 + }, + { + "epoch": 6.248535734479852, + "grad_norm": 0.24113597507233553, + "learning_rate": 4.17316180073668e-06, + "loss": 0.1871, + "num_tokens": 3493268337.0, + "step": 4580 + }, + { + "epoch": 6.249901315438533, + "grad_norm": 0.2342872229189589, + "learning_rate": 4.169923674753566e-06, + "loss": 0.1847, + "num_tokens": 3494004643.0, + "step": 4581 + }, + { + "epoch": 6.251266896397214, + "grad_norm": 0.20985481049045268, + "learning_rate": 4.166687632330954e-06, + "loss": 0.1793, + "num_tokens": 3494777242.0, + "step": 4582 + }, + { + "epoch": 6.252632477355894, + "grad_norm": 0.2182906627022529, + "learning_rate": 4.163453674456022e-06, + "loss": 0.1852, + "num_tokens": 3495611764.0, + "step": 4583 + }, + { + "epoch": 6.253998058314575, + "grad_norm": 0.21525479474901657, + "learning_rate": 4.160221802115305e-06, + "loss": 0.185, + "num_tokens": 3496354529.0, + "step": 4584 + }, + { + "epoch": 6.255363639273255, + "grad_norm": 0.2360289916713184, + "learning_rate": 4.156992016294712e-06, + "loss": 0.182, + "num_tokens": 3497144767.0, + "step": 4585 + }, + { + "epoch": 6.256729220231936, + "grad_norm": 0.22642382925621618, + "learning_rate": 4.153764317979511e-06, + "loss": 0.175, + "num_tokens": 3497844724.0, + "step": 4586 + }, + { + "epoch": 6.258094801190616, + "grad_norm": 0.22209425149287665, + "learning_rate": 4.150538708154326e-06, + "loss": 0.182, + "num_tokens": 3498572429.0, + "step": 4587 + }, + { + "epoch": 6.259460382149296, + "grad_norm": 0.2323186731317651, + "learning_rate": 4.147315187803161e-06, + "loss": 0.172, + "num_tokens": 3499266427.0, + "step": 4588 + }, + { + "epoch": 6.260825963107977, + "grad_norm": 0.23086929032987255, + "learning_rate": 4.144093757909365e-06, + "loss": 0.184, + "num_tokens": 3500080062.0, + "step": 4589 + }, + { + "epoch": 6.262191544066657, + "grad_norm": 0.2210544528967968, + "learning_rate": 4.1408744194556615e-06, + "loss": 0.1802, + "num_tokens": 3500895054.0, + "step": 4590 + }, + { + "epoch": 6.263557125025338, + "grad_norm": 0.21735116796503404, + "learning_rate": 4.137657173424128e-06, + "loss": 0.1823, + "num_tokens": 3501620088.0, + "step": 4591 + }, + { + "epoch": 6.264922705984018, + "grad_norm": 0.23619968376155812, + "learning_rate": 4.1344420207962086e-06, + "loss": 0.1752, + "num_tokens": 3502307483.0, + "step": 4592 + }, + { + "epoch": 6.266288286942699, + "grad_norm": 0.2321990526738248, + "learning_rate": 4.131228962552707e-06, + "loss": 0.1655, + "num_tokens": 3502992365.0, + "step": 4593 + }, + { + "epoch": 6.267653867901379, + "grad_norm": 0.23434853542778, + "learning_rate": 4.128017999673786e-06, + "loss": 0.1748, + "num_tokens": 3503751298.0, + "step": 4594 + }, + { + "epoch": 6.26901944886006, + "grad_norm": 0.22439757606691704, + "learning_rate": 4.124809133138977e-06, + "loss": 0.1773, + "num_tokens": 3504505387.0, + "step": 4595 + }, + { + "epoch": 6.2703850298187405, + "grad_norm": 0.22132451061902028, + "learning_rate": 4.121602363927155e-06, + "loss": 0.1771, + "num_tokens": 3505223012.0, + "step": 4596 + }, + { + "epoch": 6.271750610777421, + "grad_norm": 0.22446146734086175, + "learning_rate": 4.118397693016582e-06, + "loss": 0.1772, + "num_tokens": 3505873858.0, + "step": 4597 + }, + { + "epoch": 6.273116191736102, + "grad_norm": 0.23785912841374188, + "learning_rate": 4.115195121384851e-06, + "loss": 0.1721, + "num_tokens": 3506574211.0, + "step": 4598 + }, + { + "epoch": 6.274481772694782, + "grad_norm": 0.22584758060774557, + "learning_rate": 4.111994650008933e-06, + "loss": 0.1833, + "num_tokens": 3507348540.0, + "step": 4599 + }, + { + "epoch": 6.275847353653463, + "grad_norm": 0.2069471263560601, + "learning_rate": 4.108796279865156e-06, + "loss": 0.1792, + "num_tokens": 3508064816.0, + "step": 4600 + }, + { + "epoch": 6.277212934612143, + "grad_norm": 0.22849604017584552, + "learning_rate": 4.1056000119292e-06, + "loss": 0.1813, + "num_tokens": 3508834231.0, + "step": 4601 + }, + { + "epoch": 6.278578515570824, + "grad_norm": 0.220626860639284, + "learning_rate": 4.102405847176108e-06, + "loss": 0.1739, + "num_tokens": 3509620070.0, + "step": 4602 + }, + { + "epoch": 6.279944096529504, + "grad_norm": 0.20824367311636907, + "learning_rate": 4.099213786580282e-06, + "loss": 0.1773, + "num_tokens": 3510319187.0, + "step": 4603 + }, + { + "epoch": 6.281309677488185, + "grad_norm": 0.22869293644876434, + "learning_rate": 4.096023831115485e-06, + "loss": 0.1859, + "num_tokens": 3511110675.0, + "step": 4604 + }, + { + "epoch": 6.282675258446865, + "grad_norm": 0.22852341386589692, + "learning_rate": 4.0928359817548275e-06, + "loss": 0.1803, + "num_tokens": 3511831234.0, + "step": 4605 + }, + { + "epoch": 6.284040839405545, + "grad_norm": 0.24648044263110894, + "learning_rate": 4.089650239470792e-06, + "loss": 0.1825, + "num_tokens": 3512589321.0, + "step": 4606 + }, + { + "epoch": 6.285406420364226, + "grad_norm": 0.2156025648603416, + "learning_rate": 4.0864666052352054e-06, + "loss": 0.1804, + "num_tokens": 3513447670.0, + "step": 4607 + }, + { + "epoch": 6.286772001322906, + "grad_norm": 0.21862622097717746, + "learning_rate": 4.083285080019257e-06, + "loss": 0.1845, + "num_tokens": 3514202403.0, + "step": 4608 + }, + { + "epoch": 6.288137582281587, + "grad_norm": 0.2304846549022789, + "learning_rate": 4.080105664793496e-06, + "loss": 0.1859, + "num_tokens": 3515010614.0, + "step": 4609 + }, + { + "epoch": 6.289503163240267, + "grad_norm": 0.21189551335791587, + "learning_rate": 4.076928360527819e-06, + "loss": 0.1765, + "num_tokens": 3515788396.0, + "step": 4610 + }, + { + "epoch": 6.290868744198948, + "grad_norm": 0.2268812022758147, + "learning_rate": 4.0737531681914864e-06, + "loss": 0.1785, + "num_tokens": 3516520038.0, + "step": 4611 + }, + { + "epoch": 6.2922343251576285, + "grad_norm": 0.2246026052922428, + "learning_rate": 4.070580088753111e-06, + "loss": 0.1786, + "num_tokens": 3517320143.0, + "step": 4612 + }, + { + "epoch": 6.2935999061163095, + "grad_norm": 0.2171828381558734, + "learning_rate": 4.067409123180666e-06, + "loss": 0.1764, + "num_tokens": 3518097785.0, + "step": 4613 + }, + { + "epoch": 6.29496548707499, + "grad_norm": 0.2294986502104526, + "learning_rate": 4.064240272441467e-06, + "loss": 0.1757, + "num_tokens": 3518852718.0, + "step": 4614 + }, + { + "epoch": 6.29633106803367, + "grad_norm": 0.21110089024395373, + "learning_rate": 4.061073537502205e-06, + "loss": 0.1798, + "num_tokens": 3519572072.0, + "step": 4615 + }, + { + "epoch": 6.297696648992351, + "grad_norm": 0.2397500487031368, + "learning_rate": 4.057908919328907e-06, + "loss": 0.1914, + "num_tokens": 3520359111.0, + "step": 4616 + }, + { + "epoch": 6.299062229951031, + "grad_norm": 0.21829353717483338, + "learning_rate": 4.054746418886956e-06, + "loss": 0.1856, + "num_tokens": 3521137754.0, + "step": 4617 + }, + { + "epoch": 6.300427810909712, + "grad_norm": 0.2321374240712012, + "learning_rate": 4.051586037141103e-06, + "loss": 0.185, + "num_tokens": 3521896847.0, + "step": 4618 + }, + { + "epoch": 6.301793391868392, + "grad_norm": 0.222466741972722, + "learning_rate": 4.048427775055439e-06, + "loss": 0.1788, + "num_tokens": 3522641389.0, + "step": 4619 + }, + { + "epoch": 6.303158972827073, + "grad_norm": 0.2299971645339046, + "learning_rate": 4.045271633593414e-06, + "loss": 0.1825, + "num_tokens": 3523373801.0, + "step": 4620 + }, + { + "epoch": 6.304524553785753, + "grad_norm": 0.2236939817377869, + "learning_rate": 4.04211761371783e-06, + "loss": 0.1754, + "num_tokens": 3524114552.0, + "step": 4621 + }, + { + "epoch": 6.305890134744434, + "grad_norm": 0.2191320558252902, + "learning_rate": 4.0389657163908425e-06, + "loss": 0.1841, + "num_tokens": 3525025106.0, + "step": 4622 + }, + { + "epoch": 6.307255715703114, + "grad_norm": 0.24162996500865816, + "learning_rate": 4.035815942573957e-06, + "loss": 0.174, + "num_tokens": 3525793061.0, + "step": 4623 + }, + { + "epoch": 6.308621296661794, + "grad_norm": 0.2002310090519619, + "learning_rate": 4.0326682932280336e-06, + "loss": 0.1817, + "num_tokens": 3526553122.0, + "step": 4624 + }, + { + "epoch": 6.309986877620475, + "grad_norm": 0.22024577044935845, + "learning_rate": 4.029522769313285e-06, + "loss": 0.1801, + "num_tokens": 3527355848.0, + "step": 4625 + }, + { + "epoch": 6.311352458579155, + "grad_norm": 0.22675363436552176, + "learning_rate": 4.026379371789274e-06, + "loss": 0.188, + "num_tokens": 3528120215.0, + "step": 4626 + }, + { + "epoch": 6.312718039537836, + "grad_norm": 0.22968293993848335, + "learning_rate": 4.023238101614917e-06, + "loss": 0.1784, + "num_tokens": 3528894356.0, + "step": 4627 + }, + { + "epoch": 6.3140836204965165, + "grad_norm": 0.22299539668455945, + "learning_rate": 4.0200989597484756e-06, + "loss": 0.1824, + "num_tokens": 3529765730.0, + "step": 4628 + }, + { + "epoch": 6.3154492014551975, + "grad_norm": 0.2191179847060652, + "learning_rate": 4.016961947147566e-06, + "loss": 0.1824, + "num_tokens": 3530548872.0, + "step": 4629 + }, + { + "epoch": 6.316814782413878, + "grad_norm": 0.23206025959905505, + "learning_rate": 4.013827064769158e-06, + "loss": 0.1809, + "num_tokens": 3531326054.0, + "step": 4630 + }, + { + "epoch": 6.318180363372559, + "grad_norm": 0.20914965197334484, + "learning_rate": 4.010694313569568e-06, + "loss": 0.1796, + "num_tokens": 3532057861.0, + "step": 4631 + }, + { + "epoch": 6.319545944331239, + "grad_norm": 0.221436283712645, + "learning_rate": 4.007563694504462e-06, + "loss": 0.1858, + "num_tokens": 3532875188.0, + "step": 4632 + }, + { + "epoch": 6.320911525289919, + "grad_norm": 0.22038072546242202, + "learning_rate": 4.004435208528854e-06, + "loss": 0.1885, + "num_tokens": 3533637412.0, + "step": 4633 + }, + { + "epoch": 6.3222771062486, + "grad_norm": 0.22486531339561402, + "learning_rate": 4.0013088565971126e-06, + "loss": 0.1759, + "num_tokens": 3534342808.0, + "step": 4634 + }, + { + "epoch": 6.32364268720728, + "grad_norm": 0.23928908114153905, + "learning_rate": 3.998184639662951e-06, + "loss": 0.1743, + "num_tokens": 3535110054.0, + "step": 4635 + }, + { + "epoch": 6.325008268165961, + "grad_norm": 0.21123091861163099, + "learning_rate": 3.9950625586794365e-06, + "loss": 0.1785, + "num_tokens": 3535836612.0, + "step": 4636 + }, + { + "epoch": 6.326373849124641, + "grad_norm": 0.22745381323725103, + "learning_rate": 3.991942614598974e-06, + "loss": 0.1754, + "num_tokens": 3536573923.0, + "step": 4637 + }, + { + "epoch": 6.327739430083322, + "grad_norm": 0.2247523369746374, + "learning_rate": 3.988824808373326e-06, + "loss": 0.1839, + "num_tokens": 3537321773.0, + "step": 4638 + }, + { + "epoch": 6.329105011042002, + "grad_norm": 0.24512911610804214, + "learning_rate": 3.9857091409536044e-06, + "loss": 0.1713, + "num_tokens": 3538040214.0, + "step": 4639 + }, + { + "epoch": 6.330470592000683, + "grad_norm": 0.22850361471914524, + "learning_rate": 3.982595613290258e-06, + "loss": 0.1778, + "num_tokens": 3538740712.0, + "step": 4640 + }, + { + "epoch": 6.331836172959363, + "grad_norm": 0.2181433212971442, + "learning_rate": 3.979484226333092e-06, + "loss": 0.1838, + "num_tokens": 3539472508.0, + "step": 4641 + }, + { + "epoch": 6.333201753918043, + "grad_norm": 0.23936787051984446, + "learning_rate": 3.976374981031256e-06, + "loss": 0.1799, + "num_tokens": 3540207921.0, + "step": 4642 + }, + { + "epoch": 6.334567334876724, + "grad_norm": 0.22896106121475626, + "learning_rate": 3.973267878333246e-06, + "loss": 0.1914, + "num_tokens": 3541059715.0, + "step": 4643 + }, + { + "epoch": 6.3359329158354045, + "grad_norm": 0.21436586729513096, + "learning_rate": 3.970162919186904e-06, + "loss": 0.1779, + "num_tokens": 3541872348.0, + "step": 4644 + }, + { + "epoch": 6.3372984967940855, + "grad_norm": 0.2365491781992192, + "learning_rate": 3.967060104539422e-06, + "loss": 0.1814, + "num_tokens": 3542563785.0, + "step": 4645 + }, + { + "epoch": 6.338664077752766, + "grad_norm": 0.22180108253027245, + "learning_rate": 3.963959435337328e-06, + "loss": 0.1796, + "num_tokens": 3543395323.0, + "step": 4646 + }, + { + "epoch": 6.340029658711447, + "grad_norm": 0.2086760766404233, + "learning_rate": 3.9608609125265065e-06, + "loss": 0.1771, + "num_tokens": 3544132690.0, + "step": 4647 + }, + { + "epoch": 6.341395239670127, + "grad_norm": 0.2305601756663565, + "learning_rate": 3.957764537052182e-06, + "loss": 0.1886, + "num_tokens": 3544918936.0, + "step": 4648 + }, + { + "epoch": 6.342760820628808, + "grad_norm": 0.2379074699616871, + "learning_rate": 3.954670309858924e-06, + "loss": 0.1778, + "num_tokens": 3545656346.0, + "step": 4649 + }, + { + "epoch": 6.344126401587488, + "grad_norm": 0.22680761925206733, + "learning_rate": 3.951578231890645e-06, + "loss": 0.1838, + "num_tokens": 3546321398.0, + "step": 4650 + }, + { + "epoch": 6.345491982546168, + "grad_norm": 0.23507353210246723, + "learning_rate": 3.948488304090607e-06, + "loss": 0.1886, + "num_tokens": 3547078530.0, + "step": 4651 + }, + { + "epoch": 6.346857563504849, + "grad_norm": 0.21148751720868564, + "learning_rate": 3.945400527401413e-06, + "loss": 0.1831, + "num_tokens": 3547874258.0, + "step": 4652 + }, + { + "epoch": 6.348223144463529, + "grad_norm": 0.20918958158014256, + "learning_rate": 3.942314902765008e-06, + "loss": 0.1715, + "num_tokens": 3548627865.0, + "step": 4653 + }, + { + "epoch": 6.34958872542221, + "grad_norm": 0.23489495661402937, + "learning_rate": 3.939231431122687e-06, + "loss": 0.1872, + "num_tokens": 3549423721.0, + "step": 4654 + }, + { + "epoch": 6.35095430638089, + "grad_norm": 0.23431304774191444, + "learning_rate": 3.9361501134150795e-06, + "loss": 0.1827, + "num_tokens": 3550157737.0, + "step": 4655 + }, + { + "epoch": 6.352319887339571, + "grad_norm": 0.2222385081460156, + "learning_rate": 3.933070950582162e-06, + "loss": 0.1768, + "num_tokens": 3550943083.0, + "step": 4656 + }, + { + "epoch": 6.353685468298251, + "grad_norm": 0.21862820456026383, + "learning_rate": 3.929993943563256e-06, + "loss": 0.1784, + "num_tokens": 3551699719.0, + "step": 4657 + }, + { + "epoch": 6.355051049256932, + "grad_norm": 0.21998676786638408, + "learning_rate": 3.926919093297017e-06, + "loss": 0.1751, + "num_tokens": 3552438492.0, + "step": 4658 + }, + { + "epoch": 6.356416630215612, + "grad_norm": 0.2369966964247266, + "learning_rate": 3.9238464007214605e-06, + "loss": 0.1753, + "num_tokens": 3553167155.0, + "step": 4659 + }, + { + "epoch": 6.3577822111742925, + "grad_norm": 0.21604083630285426, + "learning_rate": 3.9207758667739214e-06, + "loss": 0.1821, + "num_tokens": 3553891467.0, + "step": 4660 + }, + { + "epoch": 6.3591477921329735, + "grad_norm": 0.2357963959873284, + "learning_rate": 3.917707492391091e-06, + "loss": 0.1807, + "num_tokens": 3554615623.0, + "step": 4661 + }, + { + "epoch": 6.360513373091654, + "grad_norm": 0.22546090470128136, + "learning_rate": 3.914641278508997e-06, + "loss": 0.1847, + "num_tokens": 3555347041.0, + "step": 4662 + }, + { + "epoch": 6.361878954050335, + "grad_norm": 0.22650929042883072, + "learning_rate": 3.9115772260630075e-06, + "loss": 0.1754, + "num_tokens": 3556119211.0, + "step": 4663 + }, + { + "epoch": 6.363244535009015, + "grad_norm": 0.222244021830333, + "learning_rate": 3.908515335987833e-06, + "loss": 0.1834, + "num_tokens": 3556841686.0, + "step": 4664 + }, + { + "epoch": 6.364610115967696, + "grad_norm": 0.22426117139032098, + "learning_rate": 3.905455609217522e-06, + "loss": 0.1897, + "num_tokens": 3557625885.0, + "step": 4665 + }, + { + "epoch": 6.365975696926376, + "grad_norm": 0.23175864754082223, + "learning_rate": 3.902398046685469e-06, + "loss": 0.1856, + "num_tokens": 3558487340.0, + "step": 4666 + }, + { + "epoch": 6.367341277885057, + "grad_norm": 0.21772293639261567, + "learning_rate": 3.899342649324396e-06, + "loss": 0.174, + "num_tokens": 3559233752.0, + "step": 4667 + }, + { + "epoch": 6.368706858843737, + "grad_norm": 0.21983357625405994, + "learning_rate": 3.8962894180663835e-06, + "loss": 0.1717, + "num_tokens": 3560019530.0, + "step": 4668 + }, + { + "epoch": 6.370072439802417, + "grad_norm": 0.2167223008211721, + "learning_rate": 3.893238353842831e-06, + "loss": 0.183, + "num_tokens": 3560818457.0, + "step": 4669 + }, + { + "epoch": 6.371438020761098, + "grad_norm": 0.22894878226125323, + "learning_rate": 3.890189457584488e-06, + "loss": 0.1737, + "num_tokens": 3561567263.0, + "step": 4670 + }, + { + "epoch": 6.372803601719778, + "grad_norm": 0.24209522913506676, + "learning_rate": 3.887142730221445e-06, + "loss": 0.1857, + "num_tokens": 3562364537.0, + "step": 4671 + }, + { + "epoch": 6.374169182678459, + "grad_norm": 0.21949835848089644, + "learning_rate": 3.8840981726831206e-06, + "loss": 0.1885, + "num_tokens": 3563196530.0, + "step": 4672 + }, + { + "epoch": 6.375534763637139, + "grad_norm": 0.2267978540315638, + "learning_rate": 3.881055785898281e-06, + "loss": 0.178, + "num_tokens": 3563980698.0, + "step": 4673 + }, + { + "epoch": 6.37690034459582, + "grad_norm": 0.2169501316218299, + "learning_rate": 3.8780155707950255e-06, + "loss": 0.1803, + "num_tokens": 3564723579.0, + "step": 4674 + }, + { + "epoch": 6.3782659255545004, + "grad_norm": 0.23829579362694697, + "learning_rate": 3.874977528300794e-06, + "loss": 0.1784, + "num_tokens": 3565479726.0, + "step": 4675 + }, + { + "epoch": 6.3796315065131814, + "grad_norm": 0.2041272829998039, + "learning_rate": 3.871941659342355e-06, + "loss": 0.1877, + "num_tokens": 3566287084.0, + "step": 4676 + }, + { + "epoch": 6.380997087471862, + "grad_norm": 0.22647911802883477, + "learning_rate": 3.868907964845833e-06, + "loss": 0.1847, + "num_tokens": 3567103567.0, + "step": 4677 + }, + { + "epoch": 6.382362668430542, + "grad_norm": 0.21582789442232744, + "learning_rate": 3.865876445736667e-06, + "loss": 0.1833, + "num_tokens": 3567937581.0, + "step": 4678 + }, + { + "epoch": 6.383728249389223, + "grad_norm": 0.2376580691182321, + "learning_rate": 3.862847102939642e-06, + "loss": 0.1795, + "num_tokens": 3568629369.0, + "step": 4679 + }, + { + "epoch": 6.385093830347903, + "grad_norm": 0.22200162539019003, + "learning_rate": 3.859819937378885e-06, + "loss": 0.1748, + "num_tokens": 3569371182.0, + "step": 4680 + }, + { + "epoch": 6.386459411306584, + "grad_norm": 0.22285516925533702, + "learning_rate": 3.856794949977849e-06, + "loss": 0.1809, + "num_tokens": 3570143252.0, + "step": 4681 + }, + { + "epoch": 6.387824992265264, + "grad_norm": 0.2151166707277959, + "learning_rate": 3.853772141659328e-06, + "loss": 0.1807, + "num_tokens": 3570932834.0, + "step": 4682 + }, + { + "epoch": 6.389190573223945, + "grad_norm": 0.2205795948056368, + "learning_rate": 3.850751513345449e-06, + "loss": 0.1719, + "num_tokens": 3571671921.0, + "step": 4683 + }, + { + "epoch": 6.390556154182625, + "grad_norm": 0.224445272868465, + "learning_rate": 3.847733065957679e-06, + "loss": 0.1748, + "num_tokens": 3572378021.0, + "step": 4684 + }, + { + "epoch": 6.391921735141306, + "grad_norm": 0.22407646267725653, + "learning_rate": 3.844716800416806e-06, + "loss": 0.1837, + "num_tokens": 3573148087.0, + "step": 4685 + }, + { + "epoch": 6.393287316099986, + "grad_norm": 0.23330017100166128, + "learning_rate": 3.841702717642975e-06, + "loss": 0.176, + "num_tokens": 3573886052.0, + "step": 4686 + }, + { + "epoch": 6.394652897058666, + "grad_norm": 0.23130023290693846, + "learning_rate": 3.838690818555642e-06, + "loss": 0.1839, + "num_tokens": 3574623596.0, + "step": 4687 + }, + { + "epoch": 6.396018478017347, + "grad_norm": 0.23736709335480422, + "learning_rate": 3.8356811040736085e-06, + "loss": 0.1862, + "num_tokens": 3575381442.0, + "step": 4688 + }, + { + "epoch": 6.397384058976027, + "grad_norm": 0.22543047740927208, + "learning_rate": 3.8326735751150145e-06, + "loss": 0.1791, + "num_tokens": 3576216422.0, + "step": 4689 + }, + { + "epoch": 6.398749639934708, + "grad_norm": 0.22514718013219293, + "learning_rate": 3.829668232597319e-06, + "loss": 0.1775, + "num_tokens": 3576924796.0, + "step": 4690 + }, + { + "epoch": 6.4001152208933885, + "grad_norm": 0.2160098441528086, + "learning_rate": 3.826665077437325e-06, + "loss": 0.1809, + "num_tokens": 3577681690.0, + "step": 4691 + }, + { + "epoch": 6.4014808018520695, + "grad_norm": 0.23029601763712723, + "learning_rate": 3.823664110551166e-06, + "loss": 0.1868, + "num_tokens": 3578391121.0, + "step": 4692 + }, + { + "epoch": 6.40284638281075, + "grad_norm": 0.2386341321048807, + "learning_rate": 3.820665332854308e-06, + "loss": 0.1746, + "num_tokens": 3579136786.0, + "step": 4693 + }, + { + "epoch": 6.404211963769431, + "grad_norm": 0.23239388035017836, + "learning_rate": 3.817668745261544e-06, + "loss": 0.1826, + "num_tokens": 3579886292.0, + "step": 4694 + }, + { + "epoch": 6.405577544728111, + "grad_norm": 0.2238964726566911, + "learning_rate": 3.814674348687005e-06, + "loss": 0.184, + "num_tokens": 3580696816.0, + "step": 4695 + }, + { + "epoch": 6.406943125686791, + "grad_norm": 0.23050576416839857, + "learning_rate": 3.811682144044155e-06, + "loss": 0.1779, + "num_tokens": 3581384894.0, + "step": 4696 + }, + { + "epoch": 6.408308706645472, + "grad_norm": 0.23314450347895815, + "learning_rate": 3.808692132245779e-06, + "loss": 0.1836, + "num_tokens": 3582141568.0, + "step": 4697 + }, + { + "epoch": 6.409674287604152, + "grad_norm": 0.22965775427780133, + "learning_rate": 3.8057043142040083e-06, + "loss": 0.1863, + "num_tokens": 3582908066.0, + "step": 4698 + }, + { + "epoch": 6.411039868562833, + "grad_norm": 0.2200206833706499, + "learning_rate": 3.8027186908302916e-06, + "loss": 0.1758, + "num_tokens": 3583619650.0, + "step": 4699 + }, + { + "epoch": 6.412405449521513, + "grad_norm": 0.2157339799837985, + "learning_rate": 3.7997352630354135e-06, + "loss": 0.1845, + "num_tokens": 3584395909.0, + "step": 4700 + }, + { + "epoch": 6.413771030480194, + "grad_norm": 0.2291752961617363, + "learning_rate": 3.7967540317294927e-06, + "loss": 0.1838, + "num_tokens": 3585119871.0, + "step": 4701 + }, + { + "epoch": 6.415136611438874, + "grad_norm": 0.22632143635683327, + "learning_rate": 3.7937749978219686e-06, + "loss": 0.18, + "num_tokens": 3585897030.0, + "step": 4702 + }, + { + "epoch": 6.416502192397555, + "grad_norm": 0.21145996806642645, + "learning_rate": 3.790798162221616e-06, + "loss": 0.1792, + "num_tokens": 3586670896.0, + "step": 4703 + }, + { + "epoch": 6.417867773356235, + "grad_norm": 0.21515669171450472, + "learning_rate": 3.787823525836542e-06, + "loss": 0.18, + "num_tokens": 3587366213.0, + "step": 4704 + }, + { + "epoch": 6.419233354314915, + "grad_norm": 0.23557383906712354, + "learning_rate": 3.7848510895741785e-06, + "loss": 0.1812, + "num_tokens": 3588061765.0, + "step": 4705 + }, + { + "epoch": 6.420598935273596, + "grad_norm": 0.21863454394723125, + "learning_rate": 3.7818808543412822e-06, + "loss": 0.1843, + "num_tokens": 3588956382.0, + "step": 4706 + }, + { + "epoch": 6.4219645162322765, + "grad_norm": 0.20835101755498803, + "learning_rate": 3.778912821043952e-06, + "loss": 0.1773, + "num_tokens": 3589758235.0, + "step": 4707 + }, + { + "epoch": 6.4233300971909575, + "grad_norm": 0.2057063836827315, + "learning_rate": 3.775946990587598e-06, + "loss": 0.1793, + "num_tokens": 3590468603.0, + "step": 4708 + }, + { + "epoch": 6.424695678149638, + "grad_norm": 0.24437615874414237, + "learning_rate": 3.772983363876971e-06, + "loss": 0.181, + "num_tokens": 3591286349.0, + "step": 4709 + }, + { + "epoch": 6.426061259108319, + "grad_norm": 0.2046744450519949, + "learning_rate": 3.7700219418161465e-06, + "loss": 0.1797, + "num_tokens": 3592059649.0, + "step": 4710 + }, + { + "epoch": 6.427426840066999, + "grad_norm": 0.22770072203107586, + "learning_rate": 3.7670627253085213e-06, + "loss": 0.1826, + "num_tokens": 3592828068.0, + "step": 4711 + }, + { + "epoch": 6.42879242102568, + "grad_norm": 0.23003097529615302, + "learning_rate": 3.7641057152568257e-06, + "loss": 0.1821, + "num_tokens": 3593538463.0, + "step": 4712 + }, + { + "epoch": 6.43015800198436, + "grad_norm": 0.22621569277207304, + "learning_rate": 3.7611509125631176e-06, + "loss": 0.1778, + "num_tokens": 3594261662.0, + "step": 4713 + }, + { + "epoch": 6.43152358294304, + "grad_norm": 0.27065122093530547, + "learning_rate": 3.7581983181287773e-06, + "loss": 0.1803, + "num_tokens": 3594982771.0, + "step": 4714 + }, + { + "epoch": 6.432889163901721, + "grad_norm": 0.22887813412352165, + "learning_rate": 3.755247932854515e-06, + "loss": 0.1826, + "num_tokens": 3595740416.0, + "step": 4715 + }, + { + "epoch": 6.434254744860401, + "grad_norm": 0.22332439976619603, + "learning_rate": 3.752299757640367e-06, + "loss": 0.1758, + "num_tokens": 3596513326.0, + "step": 4716 + }, + { + "epoch": 6.435620325819082, + "grad_norm": 0.22625461250911724, + "learning_rate": 3.7493537933856893e-06, + "loss": 0.1748, + "num_tokens": 3597320666.0, + "step": 4717 + }, + { + "epoch": 6.436985906777762, + "grad_norm": 0.22496017436071017, + "learning_rate": 3.7464100409891717e-06, + "loss": 0.1845, + "num_tokens": 3598039515.0, + "step": 4718 + }, + { + "epoch": 6.438351487736443, + "grad_norm": 0.23505265823779417, + "learning_rate": 3.7434685013488253e-06, + "loss": 0.1686, + "num_tokens": 3598730197.0, + "step": 4719 + }, + { + "epoch": 6.439717068695123, + "grad_norm": 0.21157946999343086, + "learning_rate": 3.7405291753619834e-06, + "loss": 0.1803, + "num_tokens": 3599487307.0, + "step": 4720 + }, + { + "epoch": 6.441082649653804, + "grad_norm": 0.22476592023972347, + "learning_rate": 3.7375920639253095e-06, + "loss": 0.1804, + "num_tokens": 3600270378.0, + "step": 4721 + }, + { + "epoch": 6.442448230612484, + "grad_norm": 0.21813437565139515, + "learning_rate": 3.7346571679347887e-06, + "loss": 0.1768, + "num_tokens": 3601058466.0, + "step": 4722 + }, + { + "epoch": 6.4438138115711645, + "grad_norm": 0.21879363477045952, + "learning_rate": 3.7317244882857297e-06, + "loss": 0.1783, + "num_tokens": 3601839270.0, + "step": 4723 + }, + { + "epoch": 6.4451793925298455, + "grad_norm": 0.22349629387091144, + "learning_rate": 3.7287940258727684e-06, + "loss": 0.1757, + "num_tokens": 3602527181.0, + "step": 4724 + }, + { + "epoch": 6.446544973488526, + "grad_norm": 0.23320098874774634, + "learning_rate": 3.725865781589862e-06, + "loss": 0.1874, + "num_tokens": 3603318558.0, + "step": 4725 + }, + { + "epoch": 6.447910554447207, + "grad_norm": 0.22515763662257524, + "learning_rate": 3.722939756330287e-06, + "loss": 0.1805, + "num_tokens": 3604040605.0, + "step": 4726 + }, + { + "epoch": 6.449276135405887, + "grad_norm": 0.23292153569376944, + "learning_rate": 3.7200159509866507e-06, + "loss": 0.1877, + "num_tokens": 3604834282.0, + "step": 4727 + }, + { + "epoch": 6.450641716364568, + "grad_norm": 0.22260454453137682, + "learning_rate": 3.717094366450881e-06, + "loss": 0.178, + "num_tokens": 3605558682.0, + "step": 4728 + }, + { + "epoch": 6.452007297323248, + "grad_norm": 0.2282820266555694, + "learning_rate": 3.714175003614221e-06, + "loss": 0.181, + "num_tokens": 3606373347.0, + "step": 4729 + }, + { + "epoch": 6.453372878281929, + "grad_norm": 0.229244625192717, + "learning_rate": 3.7112578633672456e-06, + "loss": 0.1867, + "num_tokens": 3607172486.0, + "step": 4730 + }, + { + "epoch": 6.454738459240609, + "grad_norm": 0.22302732212756482, + "learning_rate": 3.708342946599847e-06, + "loss": 0.1804, + "num_tokens": 3607940230.0, + "step": 4731 + }, + { + "epoch": 6.456104040199289, + "grad_norm": 0.2144303700158582, + "learning_rate": 3.7054302542012423e-06, + "loss": 0.1787, + "num_tokens": 3608714880.0, + "step": 4732 + }, + { + "epoch": 6.45746962115797, + "grad_norm": 0.2234023605258637, + "learning_rate": 3.702519787059966e-06, + "loss": 0.1865, + "num_tokens": 3609574849.0, + "step": 4733 + }, + { + "epoch": 6.45883520211665, + "grad_norm": 0.2063112142664421, + "learning_rate": 3.699611546063875e-06, + "loss": 0.1835, + "num_tokens": 3610230051.0, + "step": 4734 + }, + { + "epoch": 6.460200783075331, + "grad_norm": 0.2399048575424592, + "learning_rate": 3.6967055321001475e-06, + "loss": 0.1857, + "num_tokens": 3610990679.0, + "step": 4735 + }, + { + "epoch": 6.461566364034011, + "grad_norm": 0.2318328853161293, + "learning_rate": 3.6938017460552844e-06, + "loss": 0.1878, + "num_tokens": 3611700606.0, + "step": 4736 + }, + { + "epoch": 6.462931944992692, + "grad_norm": 0.2217707669562879, + "learning_rate": 3.6909001888151052e-06, + "loss": 0.1779, + "num_tokens": 3612447433.0, + "step": 4737 + }, + { + "epoch": 6.464297525951372, + "grad_norm": 0.2252814826519996, + "learning_rate": 3.6880008612647466e-06, + "loss": 0.1752, + "num_tokens": 3613188646.0, + "step": 4738 + }, + { + "epoch": 6.465663106910053, + "grad_norm": 0.2175655692944809, + "learning_rate": 3.685103764288669e-06, + "loss": 0.1766, + "num_tokens": 3613943320.0, + "step": 4739 + }, + { + "epoch": 6.4670286878687335, + "grad_norm": 0.22168190810086927, + "learning_rate": 3.682208898770654e-06, + "loss": 0.1793, + "num_tokens": 3614720354.0, + "step": 4740 + }, + { + "epoch": 6.468394268827414, + "grad_norm": 0.2141865334257175, + "learning_rate": 3.6793162655937926e-06, + "loss": 0.18, + "num_tokens": 3615558618.0, + "step": 4741 + }, + { + "epoch": 6.469759849786095, + "grad_norm": 0.21007726052422543, + "learning_rate": 3.676425865640512e-06, + "loss": 0.1714, + "num_tokens": 3616309017.0, + "step": 4742 + }, + { + "epoch": 6.471125430744775, + "grad_norm": 0.21408981407228747, + "learning_rate": 3.6735376997925405e-06, + "loss": 0.1792, + "num_tokens": 3617083777.0, + "step": 4743 + }, + { + "epoch": 6.472491011703456, + "grad_norm": 0.21169944794336004, + "learning_rate": 3.670651768930934e-06, + "loss": 0.1829, + "num_tokens": 3617860559.0, + "step": 4744 + }, + { + "epoch": 6.473856592662136, + "grad_norm": 0.2313803249364492, + "learning_rate": 3.6677680739360654e-06, + "loss": 0.1768, + "num_tokens": 3618579668.0, + "step": 4745 + }, + { + "epoch": 6.475222173620817, + "grad_norm": 0.2193264049504872, + "learning_rate": 3.664886615687629e-06, + "loss": 0.1768, + "num_tokens": 3619314303.0, + "step": 4746 + }, + { + "epoch": 6.476587754579497, + "grad_norm": 0.22900706638391982, + "learning_rate": 3.662007395064624e-06, + "loss": 0.1747, + "num_tokens": 3620108500.0, + "step": 4747 + }, + { + "epoch": 6.477953335538178, + "grad_norm": 0.2179348536458725, + "learning_rate": 3.6591304129453862e-06, + "loss": 0.1801, + "num_tokens": 3620834478.0, + "step": 4748 + }, + { + "epoch": 6.479318916496858, + "grad_norm": 0.2140647025247064, + "learning_rate": 3.656255670207554e-06, + "loss": 0.1827, + "num_tokens": 3621583313.0, + "step": 4749 + }, + { + "epoch": 6.480684497455538, + "grad_norm": 0.22938863277652466, + "learning_rate": 3.653383167728081e-06, + "loss": 0.1742, + "num_tokens": 3622376276.0, + "step": 4750 + }, + { + "epoch": 6.482050078414219, + "grad_norm": 0.21493211754686253, + "learning_rate": 3.6505129063832532e-06, + "loss": 0.179, + "num_tokens": 3623135906.0, + "step": 4751 + }, + { + "epoch": 6.483415659372899, + "grad_norm": 0.2120222351936783, + "learning_rate": 3.647644887048656e-06, + "loss": 0.1822, + "num_tokens": 3623972103.0, + "step": 4752 + }, + { + "epoch": 6.48478124033158, + "grad_norm": 0.2234264337450395, + "learning_rate": 3.6447791105992005e-06, + "loss": 0.1844, + "num_tokens": 3624740520.0, + "step": 4753 + }, + { + "epoch": 6.48614682129026, + "grad_norm": 0.2099620188238082, + "learning_rate": 3.6419155779091098e-06, + "loss": 0.1772, + "num_tokens": 3625508282.0, + "step": 4754 + }, + { + "epoch": 6.487512402248941, + "grad_norm": 0.21825852464548642, + "learning_rate": 3.639054289851929e-06, + "loss": 0.1785, + "num_tokens": 3626263570.0, + "step": 4755 + }, + { + "epoch": 6.4888779832076215, + "grad_norm": 0.22483304412252836, + "learning_rate": 3.636195247300505e-06, + "loss": 0.1871, + "num_tokens": 3627041910.0, + "step": 4756 + }, + { + "epoch": 6.4902435641663025, + "grad_norm": 0.2279487423006822, + "learning_rate": 3.6333384511270124e-06, + "loss": 0.1722, + "num_tokens": 3627798121.0, + "step": 4757 + }, + { + "epoch": 6.491609145124983, + "grad_norm": 0.2156216861389156, + "learning_rate": 3.630483902202938e-06, + "loss": 0.182, + "num_tokens": 3628617831.0, + "step": 4758 + }, + { + "epoch": 6.492974726083663, + "grad_norm": 0.22699299600406742, + "learning_rate": 3.627631601399073e-06, + "loss": 0.1753, + "num_tokens": 3629336062.0, + "step": 4759 + }, + { + "epoch": 6.494340307042344, + "grad_norm": 0.22423566268388165, + "learning_rate": 3.624781549585542e-06, + "loss": 0.1851, + "num_tokens": 3630136894.0, + "step": 4760 + }, + { + "epoch": 6.495705888001024, + "grad_norm": 0.23548155388779976, + "learning_rate": 3.6219337476317635e-06, + "loss": 0.1717, + "num_tokens": 3630831243.0, + "step": 4761 + }, + { + "epoch": 6.497071468959705, + "grad_norm": 0.2306729069560012, + "learning_rate": 3.619088196406483e-06, + "loss": 0.1701, + "num_tokens": 3631583309.0, + "step": 4762 + }, + { + "epoch": 6.498437049918385, + "grad_norm": 0.22086649154944943, + "learning_rate": 3.616244896777752e-06, + "loss": 0.186, + "num_tokens": 3632375695.0, + "step": 4763 + }, + { + "epoch": 6.499802630877066, + "grad_norm": 0.22228069400480052, + "learning_rate": 3.613403849612942e-06, + "loss": 0.1814, + "num_tokens": 3633109951.0, + "step": 4764 + }, + { + "epoch": 6.501168211835746, + "grad_norm": 0.22585595825914953, + "learning_rate": 3.6105650557787297e-06, + "loss": 0.1774, + "num_tokens": 3633861680.0, + "step": 4765 + }, + { + "epoch": 6.502533792794427, + "grad_norm": 0.23165844650568887, + "learning_rate": 3.6077285161411096e-06, + "loss": 0.1871, + "num_tokens": 3634606790.0, + "step": 4766 + }, + { + "epoch": 6.503899373753107, + "grad_norm": 0.21983394716272808, + "learning_rate": 3.60489423156539e-06, + "loss": 0.1805, + "num_tokens": 3635356600.0, + "step": 4767 + }, + { + "epoch": 6.505264954711787, + "grad_norm": 0.22320113939688455, + "learning_rate": 3.60206220291618e-06, + "loss": 0.177, + "num_tokens": 3636035498.0, + "step": 4768 + }, + { + "epoch": 6.506630535670468, + "grad_norm": 0.22327898809139804, + "learning_rate": 3.599232431057419e-06, + "loss": 0.1837, + "num_tokens": 3636799787.0, + "step": 4769 + }, + { + "epoch": 6.5079961166291485, + "grad_norm": 0.2282535912065186, + "learning_rate": 3.5964049168523418e-06, + "loss": 0.1822, + "num_tokens": 3637565458.0, + "step": 4770 + }, + { + "epoch": 6.5093616975878295, + "grad_norm": 0.21515331770538096, + "learning_rate": 3.593579661163502e-06, + "loss": 0.1884, + "num_tokens": 3638317227.0, + "step": 4771 + }, + { + "epoch": 6.51072727854651, + "grad_norm": 0.23533333065453246, + "learning_rate": 3.5907566648527647e-06, + "loss": 0.1819, + "num_tokens": 3639077023.0, + "step": 4772 + }, + { + "epoch": 6.512092859505191, + "grad_norm": 0.22806343141940338, + "learning_rate": 3.5879359287812997e-06, + "loss": 0.1782, + "num_tokens": 3639882942.0, + "step": 4773 + }, + { + "epoch": 6.513458440463871, + "grad_norm": 0.20767436332857178, + "learning_rate": 3.5851174538095923e-06, + "loss": 0.1829, + "num_tokens": 3640609825.0, + "step": 4774 + }, + { + "epoch": 6.514824021422552, + "grad_norm": 0.22926492261394626, + "learning_rate": 3.5823012407974407e-06, + "loss": 0.1807, + "num_tokens": 3641366935.0, + "step": 4775 + }, + { + "epoch": 6.516189602381232, + "grad_norm": 0.2258133987110664, + "learning_rate": 3.579487290603949e-06, + "loss": 0.175, + "num_tokens": 3642090363.0, + "step": 4776 + }, + { + "epoch": 6.517555183339912, + "grad_norm": 0.2265875394434879, + "learning_rate": 3.576675604087524e-06, + "loss": 0.1808, + "num_tokens": 3642852870.0, + "step": 4777 + }, + { + "epoch": 6.518920764298593, + "grad_norm": 0.22829008343580473, + "learning_rate": 3.5738661821059004e-06, + "loss": 0.1849, + "num_tokens": 3643649150.0, + "step": 4778 + }, + { + "epoch": 6.520286345257273, + "grad_norm": 0.2210068233585175, + "learning_rate": 3.5710590255161047e-06, + "loss": 0.1829, + "num_tokens": 3644368437.0, + "step": 4779 + }, + { + "epoch": 6.521651926215954, + "grad_norm": 0.22723059250228148, + "learning_rate": 3.5682541351744794e-06, + "loss": 0.1844, + "num_tokens": 3645072397.0, + "step": 4780 + }, + { + "epoch": 6.523017507174634, + "grad_norm": 0.22255589593864541, + "learning_rate": 3.5654515119366795e-06, + "loss": 0.1805, + "num_tokens": 3645817812.0, + "step": 4781 + }, + { + "epoch": 6.524383088133315, + "grad_norm": 0.21064919171390636, + "learning_rate": 3.5626511566576568e-06, + "loss": 0.1917, + "num_tokens": 3646607288.0, + "step": 4782 + }, + { + "epoch": 6.525748669091995, + "grad_norm": 0.23858110980550917, + "learning_rate": 3.559853070191682e-06, + "loss": 0.1843, + "num_tokens": 3647410401.0, + "step": 4783 + }, + { + "epoch": 6.527114250050676, + "grad_norm": 0.23223087139199772, + "learning_rate": 3.557057253392331e-06, + "loss": 0.1862, + "num_tokens": 3648146124.0, + "step": 4784 + }, + { + "epoch": 6.528479831009356, + "grad_norm": 0.22471123202484494, + "learning_rate": 3.554263707112487e-06, + "loss": 0.1871, + "num_tokens": 3648964007.0, + "step": 4785 + }, + { + "epoch": 6.5298454119680365, + "grad_norm": 0.23695961626303033, + "learning_rate": 3.551472432204335e-06, + "loss": 0.1782, + "num_tokens": 3649729878.0, + "step": 4786 + }, + { + "epoch": 6.5312109929267175, + "grad_norm": 0.22391162910446552, + "learning_rate": 3.5486834295193785e-06, + "loss": 0.1773, + "num_tokens": 3650463165.0, + "step": 4787 + }, + { + "epoch": 6.532576573885398, + "grad_norm": 0.2232085517123663, + "learning_rate": 3.545896699908417e-06, + "loss": 0.1795, + "num_tokens": 3651201748.0, + "step": 4788 + }, + { + "epoch": 6.533942154844079, + "grad_norm": 0.23691144618455381, + "learning_rate": 3.543112244221564e-06, + "loss": 0.1888, + "num_tokens": 3652092189.0, + "step": 4789 + }, + { + "epoch": 6.535307735802759, + "grad_norm": 0.22490600982905473, + "learning_rate": 3.540330063308235e-06, + "loss": 0.1831, + "num_tokens": 3652873036.0, + "step": 4790 + }, + { + "epoch": 6.53667331676144, + "grad_norm": 0.22435757219598473, + "learning_rate": 3.5375501580171516e-06, + "loss": 0.1839, + "num_tokens": 3653637831.0, + "step": 4791 + }, + { + "epoch": 6.53803889772012, + "grad_norm": 0.2155973791504615, + "learning_rate": 3.534772529196344e-06, + "loss": 0.1833, + "num_tokens": 3654425575.0, + "step": 4792 + }, + { + "epoch": 6.539404478678801, + "grad_norm": 0.23530158774510448, + "learning_rate": 3.5319971776931466e-06, + "loss": 0.18, + "num_tokens": 3655244904.0, + "step": 4793 + }, + { + "epoch": 6.540770059637481, + "grad_norm": 0.20648889007628898, + "learning_rate": 3.5292241043542e-06, + "loss": 0.1796, + "num_tokens": 3656016792.0, + "step": 4794 + }, + { + "epoch": 6.542135640596161, + "grad_norm": 0.22104453701631827, + "learning_rate": 3.526453310025445e-06, + "loss": 0.186, + "num_tokens": 3656772482.0, + "step": 4795 + }, + { + "epoch": 6.543501221554842, + "grad_norm": 0.22926923527423118, + "learning_rate": 3.5236847955521336e-06, + "loss": 0.1871, + "num_tokens": 3657523431.0, + "step": 4796 + }, + { + "epoch": 6.544866802513522, + "grad_norm": 0.22803608821047552, + "learning_rate": 3.52091856177882e-06, + "loss": 0.1835, + "num_tokens": 3658304464.0, + "step": 4797 + }, + { + "epoch": 6.546232383472203, + "grad_norm": 0.2887884097685323, + "learning_rate": 3.5181546095493615e-06, + "loss": 0.187, + "num_tokens": 3659140542.0, + "step": 4798 + }, + { + "epoch": 6.547597964430883, + "grad_norm": 0.21443720294582588, + "learning_rate": 3.515392939706922e-06, + "loss": 0.1885, + "num_tokens": 3659934477.0, + "step": 4799 + }, + { + "epoch": 6.548963545389564, + "grad_norm": 0.21146978928843968, + "learning_rate": 3.5126335530939636e-06, + "loss": 0.1837, + "num_tokens": 3660760644.0, + "step": 4800 + }, + { + "epoch": 6.550329126348244, + "grad_norm": 0.21727373971762395, + "learning_rate": 3.5098764505522582e-06, + "loss": 0.1808, + "num_tokens": 3661503341.0, + "step": 4801 + }, + { + "epoch": 6.551694707306925, + "grad_norm": 0.2356568973728017, + "learning_rate": 3.5071216329228784e-06, + "loss": 0.1774, + "num_tokens": 3662262489.0, + "step": 4802 + }, + { + "epoch": 6.5530602882656055, + "grad_norm": 0.22515861563110076, + "learning_rate": 3.5043691010461996e-06, + "loss": 0.18, + "num_tokens": 3662991156.0, + "step": 4803 + }, + { + "epoch": 6.554425869224286, + "grad_norm": 0.20769287043248816, + "learning_rate": 3.5016188557619014e-06, + "loss": 0.1807, + "num_tokens": 3663740009.0, + "step": 4804 + }, + { + "epoch": 6.555791450182967, + "grad_norm": 0.23436581245797553, + "learning_rate": 3.4988708979089607e-06, + "loss": 0.183, + "num_tokens": 3664588188.0, + "step": 4805 + }, + { + "epoch": 6.557157031141647, + "grad_norm": 0.21522648493837862, + "learning_rate": 3.496125228325663e-06, + "loss": 0.1808, + "num_tokens": 3665370956.0, + "step": 4806 + }, + { + "epoch": 6.558522612100328, + "grad_norm": 0.23170779984980064, + "learning_rate": 3.493381847849593e-06, + "loss": 0.1793, + "num_tokens": 3666065625.0, + "step": 4807 + }, + { + "epoch": 6.559888193059008, + "grad_norm": 0.23184877641013862, + "learning_rate": 3.4906407573176395e-06, + "loss": 0.1855, + "num_tokens": 3666795284.0, + "step": 4808 + }, + { + "epoch": 6.561253774017689, + "grad_norm": 0.21595031908544562, + "learning_rate": 3.487901957565986e-06, + "loss": 0.1774, + "num_tokens": 3667517081.0, + "step": 4809 + }, + { + "epoch": 6.562619354976369, + "grad_norm": 0.22509699032226332, + "learning_rate": 3.485165449430124e-06, + "loss": 0.1861, + "num_tokens": 3668306469.0, + "step": 4810 + }, + { + "epoch": 6.56398493593505, + "grad_norm": 0.21343485412770635, + "learning_rate": 3.482431233744846e-06, + "loss": 0.182, + "num_tokens": 3669123553.0, + "step": 4811 + }, + { + "epoch": 6.56535051689373, + "grad_norm": 0.21767751706702304, + "learning_rate": 3.4796993113442355e-06, + "loss": 0.1829, + "num_tokens": 3669894804.0, + "step": 4812 + }, + { + "epoch": 6.56671609785241, + "grad_norm": 0.21331809850934497, + "learning_rate": 3.476969683061694e-06, + "loss": 0.1819, + "num_tokens": 3670699111.0, + "step": 4813 + }, + { + "epoch": 6.568081678811091, + "grad_norm": 0.21121591440758444, + "learning_rate": 3.474242349729904e-06, + "loss": 0.1787, + "num_tokens": 3671498463.0, + "step": 4814 + }, + { + "epoch": 6.569447259769771, + "grad_norm": 0.2279823790914049, + "learning_rate": 3.47151731218086e-06, + "loss": 0.1898, + "num_tokens": 3672329311.0, + "step": 4815 + }, + { + "epoch": 6.570812840728452, + "grad_norm": 0.2061356450958764, + "learning_rate": 3.4687945712458543e-06, + "loss": 0.1899, + "num_tokens": 3673106078.0, + "step": 4816 + }, + { + "epoch": 6.572178421687132, + "grad_norm": 0.22411591340123, + "learning_rate": 3.4660741277554776e-06, + "loss": 0.1931, + "num_tokens": 3673862511.0, + "step": 4817 + }, + { + "epoch": 6.573544002645813, + "grad_norm": 0.22078619941856764, + "learning_rate": 3.4633559825396167e-06, + "loss": 0.1795, + "num_tokens": 3674651619.0, + "step": 4818 + }, + { + "epoch": 6.5749095836044935, + "grad_norm": 0.2848154979272138, + "learning_rate": 3.46064013642746e-06, + "loss": 0.1741, + "num_tokens": 3675421560.0, + "step": 4819 + }, + { + "epoch": 6.5762751645631745, + "grad_norm": 0.21447635245095573, + "learning_rate": 3.457926590247499e-06, + "loss": 0.1816, + "num_tokens": 3676168927.0, + "step": 4820 + }, + { + "epoch": 6.577640745521855, + "grad_norm": 0.23951090040235246, + "learning_rate": 3.455215344827511e-06, + "loss": 0.1756, + "num_tokens": 3676945516.0, + "step": 4821 + }, + { + "epoch": 6.579006326480535, + "grad_norm": 0.2069201234309381, + "learning_rate": 3.452506400994588e-06, + "loss": 0.1739, + "num_tokens": 3677747801.0, + "step": 4822 + }, + { + "epoch": 6.580371907439216, + "grad_norm": 0.21764044633346558, + "learning_rate": 3.449799759575107e-06, + "loss": 0.181, + "num_tokens": 3678562736.0, + "step": 4823 + }, + { + "epoch": 6.581737488397896, + "grad_norm": 0.21737917694850611, + "learning_rate": 3.4470954213947472e-06, + "loss": 0.1895, + "num_tokens": 3679339737.0, + "step": 4824 + }, + { + "epoch": 6.583103069356577, + "grad_norm": 0.22831936347462248, + "learning_rate": 3.444393387278486e-06, + "loss": 0.1856, + "num_tokens": 3680147407.0, + "step": 4825 + }, + { + "epoch": 6.584468650315257, + "grad_norm": 0.20694477243610473, + "learning_rate": 3.441693658050599e-06, + "loss": 0.1803, + "num_tokens": 3680920441.0, + "step": 4826 + }, + { + "epoch": 6.585834231273938, + "grad_norm": 0.2223391131817523, + "learning_rate": 3.438996234534654e-06, + "loss": 0.171, + "num_tokens": 3681649657.0, + "step": 4827 + }, + { + "epoch": 6.587199812232618, + "grad_norm": 0.21882424363083458, + "learning_rate": 3.4363011175535167e-06, + "loss": 0.1857, + "num_tokens": 3682381101.0, + "step": 4828 + }, + { + "epoch": 6.588565393191299, + "grad_norm": 0.2470869020974388, + "learning_rate": 3.433608307929354e-06, + "loss": 0.1854, + "num_tokens": 3683183073.0, + "step": 4829 + }, + { + "epoch": 6.589930974149979, + "grad_norm": 0.21654690296937112, + "learning_rate": 3.430917806483621e-06, + "loss": 0.1775, + "num_tokens": 3683941267.0, + "step": 4830 + }, + { + "epoch": 6.591296555108659, + "grad_norm": 0.21443759710205532, + "learning_rate": 3.4282296140370807e-06, + "loss": 0.1806, + "num_tokens": 3684741206.0, + "step": 4831 + }, + { + "epoch": 6.59266213606734, + "grad_norm": 0.21034103038296753, + "learning_rate": 3.425543731409777e-06, + "loss": 0.1758, + "num_tokens": 3685479790.0, + "step": 4832 + }, + { + "epoch": 6.59402771702602, + "grad_norm": 0.22911965372523307, + "learning_rate": 3.4228601594210587e-06, + "loss": 0.1832, + "num_tokens": 3686224594.0, + "step": 4833 + }, + { + "epoch": 6.595393297984701, + "grad_norm": 0.22110409450488314, + "learning_rate": 3.420178898889571e-06, + "loss": 0.1751, + "num_tokens": 3686971812.0, + "step": 4834 + }, + { + "epoch": 6.5967588789433815, + "grad_norm": 0.2477610221994168, + "learning_rate": 3.4174999506332434e-06, + "loss": 0.1851, + "num_tokens": 3687677318.0, + "step": 4835 + }, + { + "epoch": 6.5981244599020625, + "grad_norm": 0.21513196922143643, + "learning_rate": 3.4148233154693116e-06, + "loss": 0.1805, + "num_tokens": 3688487786.0, + "step": 4836 + }, + { + "epoch": 6.599490040860743, + "grad_norm": 0.21222539739223864, + "learning_rate": 3.4121489942143006e-06, + "loss": 0.1871, + "num_tokens": 3689217720.0, + "step": 4837 + }, + { + "epoch": 6.600855621819424, + "grad_norm": 0.23404660271559805, + "learning_rate": 3.4094769876840305e-06, + "loss": 0.1809, + "num_tokens": 3689963805.0, + "step": 4838 + }, + { + "epoch": 6.602221202778104, + "grad_norm": 0.22709276811637877, + "learning_rate": 3.4068072966936106e-06, + "loss": 0.1776, + "num_tokens": 3690700761.0, + "step": 4839 + }, + { + "epoch": 6.603586783736784, + "grad_norm": 0.22998078455947865, + "learning_rate": 3.4041399220574554e-06, + "loss": 0.1791, + "num_tokens": 3691429771.0, + "step": 4840 + }, + { + "epoch": 6.604952364695465, + "grad_norm": 0.22975225226587087, + "learning_rate": 3.401474864589258e-06, + "loss": 0.1828, + "num_tokens": 3692150449.0, + "step": 4841 + }, + { + "epoch": 6.606317945654145, + "grad_norm": 0.23009457839580627, + "learning_rate": 3.398812125102017e-06, + "loss": 0.1876, + "num_tokens": 3692970592.0, + "step": 4842 + }, + { + "epoch": 6.607683526612826, + "grad_norm": 0.2250966665782744, + "learning_rate": 3.3961517044080182e-06, + "loss": 0.1761, + "num_tokens": 3693687446.0, + "step": 4843 + }, + { + "epoch": 6.609049107571506, + "grad_norm": 0.22360865049426387, + "learning_rate": 3.393493603318838e-06, + "loss": 0.1757, + "num_tokens": 3694429415.0, + "step": 4844 + }, + { + "epoch": 6.610414688530187, + "grad_norm": 0.24429494406894797, + "learning_rate": 3.3908378226453497e-06, + "loss": 0.1803, + "num_tokens": 3695137197.0, + "step": 4845 + }, + { + "epoch": 6.611780269488867, + "grad_norm": 0.22566137214324625, + "learning_rate": 3.3881843631977163e-06, + "loss": 0.1725, + "num_tokens": 3695880173.0, + "step": 4846 + }, + { + "epoch": 6.613145850447548, + "grad_norm": 0.22954624379362149, + "learning_rate": 3.3855332257853966e-06, + "loss": 0.1813, + "num_tokens": 3696658450.0, + "step": 4847 + }, + { + "epoch": 6.614511431406228, + "grad_norm": 0.2278589311559936, + "learning_rate": 3.382884411217131e-06, + "loss": 0.1708, + "num_tokens": 3697385255.0, + "step": 4848 + }, + { + "epoch": 6.615877012364908, + "grad_norm": 0.21851124685806794, + "learning_rate": 3.3802379203009662e-06, + "loss": 0.1785, + "num_tokens": 3698167435.0, + "step": 4849 + }, + { + "epoch": 6.617242593323589, + "grad_norm": 0.2094659226249039, + "learning_rate": 3.377593753844228e-06, + "loss": 0.1804, + "num_tokens": 3698967758.0, + "step": 4850 + }, + { + "epoch": 6.6186081742822696, + "grad_norm": 0.23889354929098042, + "learning_rate": 3.3749519126535337e-06, + "loss": 0.1885, + "num_tokens": 3699754695.0, + "step": 4851 + }, + { + "epoch": 6.6199737552409506, + "grad_norm": 0.21781996486086538, + "learning_rate": 3.3723123975348016e-06, + "loss": 0.1876, + "num_tokens": 3700560364.0, + "step": 4852 + }, + { + "epoch": 6.621339336199631, + "grad_norm": 0.21839122768274152, + "learning_rate": 3.3696752092932283e-06, + "loss": 0.1892, + "num_tokens": 3701296715.0, + "step": 4853 + }, + { + "epoch": 6.622704917158312, + "grad_norm": 0.22697885916400812, + "learning_rate": 3.367040348733307e-06, + "loss": 0.1932, + "num_tokens": 3702122819.0, + "step": 4854 + }, + { + "epoch": 6.624070498116992, + "grad_norm": 0.22600928434059214, + "learning_rate": 3.36440781665882e-06, + "loss": 0.1741, + "num_tokens": 3702869826.0, + "step": 4855 + }, + { + "epoch": 6.625436079075673, + "grad_norm": 0.21862268586887315, + "learning_rate": 3.361777613872841e-06, + "loss": 0.1824, + "num_tokens": 3703667911.0, + "step": 4856 + }, + { + "epoch": 6.626801660034353, + "grad_norm": 0.21888720102719694, + "learning_rate": 3.359149741177724e-06, + "loss": 0.1715, + "num_tokens": 3704374127.0, + "step": 4857 + }, + { + "epoch": 6.628167240993033, + "grad_norm": 0.2405388945107942, + "learning_rate": 3.356524199375129e-06, + "loss": 0.1819, + "num_tokens": 3705185823.0, + "step": 4858 + }, + { + "epoch": 6.629532821951714, + "grad_norm": 0.2148834742173097, + "learning_rate": 3.3539009892659873e-06, + "loss": 0.187, + "num_tokens": 3705958395.0, + "step": 4859 + }, + { + "epoch": 6.630898402910394, + "grad_norm": 0.2299156825844622, + "learning_rate": 3.351280111650529e-06, + "loss": 0.183, + "num_tokens": 3706688539.0, + "step": 4860 + }, + { + "epoch": 6.632263983869075, + "grad_norm": 0.22129554969191337, + "learning_rate": 3.348661567328273e-06, + "loss": 0.1811, + "num_tokens": 3707515907.0, + "step": 4861 + }, + { + "epoch": 6.633629564827755, + "grad_norm": 0.2201366827231735, + "learning_rate": 3.34604535709802e-06, + "loss": 0.1783, + "num_tokens": 3708351392.0, + "step": 4862 + }, + { + "epoch": 6.634995145786436, + "grad_norm": 0.2104719653928465, + "learning_rate": 3.343431481757864e-06, + "loss": 0.1751, + "num_tokens": 3709123282.0, + "step": 4863 + }, + { + "epoch": 6.636360726745116, + "grad_norm": 0.22288152649292414, + "learning_rate": 3.3408199421051845e-06, + "loss": 0.1773, + "num_tokens": 3709912489.0, + "step": 4864 + }, + { + "epoch": 6.637726307703797, + "grad_norm": 0.21881221239134527, + "learning_rate": 3.3382107389366527e-06, + "loss": 0.1789, + "num_tokens": 3710640352.0, + "step": 4865 + }, + { + "epoch": 6.6390918886624775, + "grad_norm": 0.22941990949603128, + "learning_rate": 3.3356038730482176e-06, + "loss": 0.1776, + "num_tokens": 3711468154.0, + "step": 4866 + }, + { + "epoch": 6.640457469621158, + "grad_norm": 0.2093290750897343, + "learning_rate": 3.332999345235124e-06, + "loss": 0.1944, + "num_tokens": 3712280015.0, + "step": 4867 + }, + { + "epoch": 6.641823050579839, + "grad_norm": 0.24270798115826317, + "learning_rate": 3.3303971562919013e-06, + "loss": 0.1802, + "num_tokens": 3713061381.0, + "step": 4868 + }, + { + "epoch": 6.643188631538519, + "grad_norm": 0.2080679269332129, + "learning_rate": 3.327797307012363e-06, + "loss": 0.1861, + "num_tokens": 3713804399.0, + "step": 4869 + }, + { + "epoch": 6.6445542124972, + "grad_norm": 0.2251924331140791, + "learning_rate": 3.3251997981896145e-06, + "loss": 0.1918, + "num_tokens": 3714572796.0, + "step": 4870 + }, + { + "epoch": 6.64591979345588, + "grad_norm": 0.23024541709220717, + "learning_rate": 3.3226046306160376e-06, + "loss": 0.1821, + "num_tokens": 3715305150.0, + "step": 4871 + }, + { + "epoch": 6.647285374414561, + "grad_norm": 0.22236457696877102, + "learning_rate": 3.3200118050833086e-06, + "loss": 0.1839, + "num_tokens": 3716122203.0, + "step": 4872 + }, + { + "epoch": 6.648650955373241, + "grad_norm": 0.28999662633463075, + "learning_rate": 3.3174213223823878e-06, + "loss": 0.1727, + "num_tokens": 3716873064.0, + "step": 4873 + }, + { + "epoch": 6.650016536331922, + "grad_norm": 0.2219927818922372, + "learning_rate": 3.3148331833035154e-06, + "loss": 0.1705, + "num_tokens": 3717562519.0, + "step": 4874 + }, + { + "epoch": 6.651382117290602, + "grad_norm": 0.21511912392471874, + "learning_rate": 3.3122473886362215e-06, + "loss": 0.1776, + "num_tokens": 3718371766.0, + "step": 4875 + }, + { + "epoch": 6.652747698249282, + "grad_norm": 0.21992165096206584, + "learning_rate": 3.309663939169323e-06, + "loss": 0.1743, + "num_tokens": 3719163033.0, + "step": 4876 + }, + { + "epoch": 6.654113279207963, + "grad_norm": 0.22184704531161742, + "learning_rate": 3.3070828356909156e-06, + "loss": 0.1834, + "num_tokens": 3719898484.0, + "step": 4877 + }, + { + "epoch": 6.655478860166643, + "grad_norm": 0.22663813455177678, + "learning_rate": 3.304504078988382e-06, + "loss": 0.1843, + "num_tokens": 3720597694.0, + "step": 4878 + }, + { + "epoch": 6.656844441125324, + "grad_norm": 0.23465924342988512, + "learning_rate": 3.3019276698483937e-06, + "loss": 0.1748, + "num_tokens": 3721256361.0, + "step": 4879 + }, + { + "epoch": 6.658210022084004, + "grad_norm": 0.23632405772929593, + "learning_rate": 3.2993536090568955e-06, + "loss": 0.1803, + "num_tokens": 3722039498.0, + "step": 4880 + }, + { + "epoch": 6.659575603042685, + "grad_norm": 0.21761004777620957, + "learning_rate": 3.2967818973991243e-06, + "loss": 0.1786, + "num_tokens": 3722817309.0, + "step": 4881 + }, + { + "epoch": 6.6609411840013655, + "grad_norm": 0.2164026366514478, + "learning_rate": 3.294212535659601e-06, + "loss": 0.1755, + "num_tokens": 3723557858.0, + "step": 4882 + }, + { + "epoch": 6.6623067649600465, + "grad_norm": 0.22203646231572757, + "learning_rate": 3.291645524622122e-06, + "loss": 0.1814, + "num_tokens": 3724285837.0, + "step": 4883 + }, + { + "epoch": 6.663672345918727, + "grad_norm": 0.21987425810173392, + "learning_rate": 3.289080865069772e-06, + "loss": 0.1844, + "num_tokens": 3725080782.0, + "step": 4884 + }, + { + "epoch": 6.665037926877407, + "grad_norm": 0.21433224589759894, + "learning_rate": 3.286518557784919e-06, + "loss": 0.1767, + "num_tokens": 3725826676.0, + "step": 4885 + }, + { + "epoch": 6.666403507836088, + "grad_norm": 0.23151045301715995, + "learning_rate": 3.283958603549211e-06, + "loss": 0.1842, + "num_tokens": 3726582591.0, + "step": 4886 + }, + { + "epoch": 6.667769088794768, + "grad_norm": 0.21355575721727604, + "learning_rate": 3.28140100314358e-06, + "loss": 0.175, + "num_tokens": 3727341210.0, + "step": 4887 + }, + { + "epoch": 6.669134669753449, + "grad_norm": 0.457746435218773, + "learning_rate": 3.278845757348241e-06, + "loss": 0.1779, + "num_tokens": 3728081889.0, + "step": 4888 + }, + { + "epoch": 6.670500250712129, + "grad_norm": 0.21743916021630652, + "learning_rate": 3.2762928669426846e-06, + "loss": 0.1736, + "num_tokens": 3728818306.0, + "step": 4889 + }, + { + "epoch": 6.67186583167081, + "grad_norm": 0.21666552159982022, + "learning_rate": 3.2737423327056893e-06, + "loss": 0.1848, + "num_tokens": 3729539453.0, + "step": 4890 + }, + { + "epoch": 6.67323141262949, + "grad_norm": 0.2343700675532772, + "learning_rate": 3.271194155415315e-06, + "loss": 0.181, + "num_tokens": 3730260660.0, + "step": 4891 + }, + { + "epoch": 6.674596993588171, + "grad_norm": 0.22901107366790074, + "learning_rate": 3.268648335848893e-06, + "loss": 0.1837, + "num_tokens": 3731053538.0, + "step": 4892 + }, + { + "epoch": 6.675962574546851, + "grad_norm": 0.21706263409297086, + "learning_rate": 3.2661048747830515e-06, + "loss": 0.1734, + "num_tokens": 3731823775.0, + "step": 4893 + }, + { + "epoch": 6.677328155505531, + "grad_norm": 0.2169084414396973, + "learning_rate": 3.2635637729936843e-06, + "loss": 0.1731, + "num_tokens": 3732471064.0, + "step": 4894 + }, + { + "epoch": 6.678693736464212, + "grad_norm": 0.23074015924237617, + "learning_rate": 3.2610250312559734e-06, + "loss": 0.1798, + "num_tokens": 3733266382.0, + "step": 4895 + }, + { + "epoch": 6.680059317422892, + "grad_norm": 0.2122072872242158, + "learning_rate": 3.258488650344379e-06, + "loss": 0.1795, + "num_tokens": 3734064279.0, + "step": 4896 + }, + { + "epoch": 6.681424898381573, + "grad_norm": 0.22214411198649975, + "learning_rate": 3.2559546310326435e-06, + "loss": 0.1821, + "num_tokens": 3734864126.0, + "step": 4897 + }, + { + "epoch": 6.6827904793402535, + "grad_norm": 0.2348672180969931, + "learning_rate": 3.2534229740937807e-06, + "loss": 0.1828, + "num_tokens": 3735651652.0, + "step": 4898 + }, + { + "epoch": 6.6841560602989345, + "grad_norm": 0.20007910206312118, + "learning_rate": 3.2508936803000936e-06, + "loss": 0.18, + "num_tokens": 3736421919.0, + "step": 4899 + }, + { + "epoch": 6.685521641257615, + "grad_norm": 0.22028860878491907, + "learning_rate": 3.248366750423162e-06, + "loss": 0.1845, + "num_tokens": 3737214326.0, + "step": 4900 + }, + { + "epoch": 6.686887222216296, + "grad_norm": 0.24917825939736216, + "learning_rate": 3.2458421852338344e-06, + "loss": 0.1843, + "num_tokens": 3738018102.0, + "step": 4901 + }, + { + "epoch": 6.688252803174976, + "grad_norm": 0.22776364947627575, + "learning_rate": 3.2433199855022574e-06, + "loss": 0.1775, + "num_tokens": 3738812017.0, + "step": 4902 + }, + { + "epoch": 6.689618384133656, + "grad_norm": 0.22544626188054653, + "learning_rate": 3.240800151997837e-06, + "loss": 0.1748, + "num_tokens": 3739540111.0, + "step": 4903 + }, + { + "epoch": 6.690983965092337, + "grad_norm": 0.2232790580724294, + "learning_rate": 3.238282685489268e-06, + "loss": 0.1791, + "num_tokens": 3740274556.0, + "step": 4904 + }, + { + "epoch": 6.692349546051017, + "grad_norm": 0.2190671293041399, + "learning_rate": 3.2357675867445233e-06, + "loss": 0.1784, + "num_tokens": 3741075465.0, + "step": 4905 + }, + { + "epoch": 6.693715127009698, + "grad_norm": 0.22229548335275695, + "learning_rate": 3.2332548565308438e-06, + "loss": 0.1852, + "num_tokens": 3741915610.0, + "step": 4906 + }, + { + "epoch": 6.695080707968378, + "grad_norm": 0.227960190661624, + "learning_rate": 3.230744495614759e-06, + "loss": 0.1836, + "num_tokens": 3742706368.0, + "step": 4907 + }, + { + "epoch": 6.696446288927059, + "grad_norm": 0.22855204808196178, + "learning_rate": 3.228236504762071e-06, + "loss": 0.1846, + "num_tokens": 3743389555.0, + "step": 4908 + }, + { + "epoch": 6.697811869885739, + "grad_norm": 0.24568375496079206, + "learning_rate": 3.225730884737862e-06, + "loss": 0.1811, + "num_tokens": 3744121560.0, + "step": 4909 + }, + { + "epoch": 6.69917745084442, + "grad_norm": 0.22649848434791564, + "learning_rate": 3.2232276363064796e-06, + "loss": 0.176, + "num_tokens": 3744903609.0, + "step": 4910 + }, + { + "epoch": 6.7005430318031, + "grad_norm": 0.20954705797349746, + "learning_rate": 3.2207267602315665e-06, + "loss": 0.1836, + "num_tokens": 3745673216.0, + "step": 4911 + }, + { + "epoch": 6.70190861276178, + "grad_norm": 0.2267430939490457, + "learning_rate": 3.2182282572760263e-06, + "loss": 0.1794, + "num_tokens": 3746439228.0, + "step": 4912 + }, + { + "epoch": 6.703274193720461, + "grad_norm": 0.22025364390416519, + "learning_rate": 3.215732128202042e-06, + "loss": 0.1786, + "num_tokens": 3747191897.0, + "step": 4913 + }, + { + "epoch": 6.7046397746791415, + "grad_norm": 0.21664441611669805, + "learning_rate": 3.2132383737710814e-06, + "loss": 0.1799, + "num_tokens": 3747957549.0, + "step": 4914 + }, + { + "epoch": 6.7060053556378225, + "grad_norm": 0.23346116295938132, + "learning_rate": 3.2107469947438745e-06, + "loss": 0.1748, + "num_tokens": 3748693883.0, + "step": 4915 + }, + { + "epoch": 6.707370936596503, + "grad_norm": 0.2244498489269335, + "learning_rate": 3.2082579918804357e-06, + "loss": 0.1825, + "num_tokens": 3749499329.0, + "step": 4916 + }, + { + "epoch": 6.708736517555184, + "grad_norm": 0.21934399220757458, + "learning_rate": 3.205771365940052e-06, + "loss": 0.1863, + "num_tokens": 3750307157.0, + "step": 4917 + }, + { + "epoch": 6.710102098513864, + "grad_norm": 0.21174279938375234, + "learning_rate": 3.203287117681288e-06, + "loss": 0.1834, + "num_tokens": 3751108463.0, + "step": 4918 + }, + { + "epoch": 6.711467679472545, + "grad_norm": 0.21530145398478084, + "learning_rate": 3.200805247861972e-06, + "loss": 0.1764, + "num_tokens": 3751852071.0, + "step": 4919 + }, + { + "epoch": 6.712833260431225, + "grad_norm": 0.21039483344873874, + "learning_rate": 3.1983257572392255e-06, + "loss": 0.1788, + "num_tokens": 3752661377.0, + "step": 4920 + }, + { + "epoch": 6.714198841389905, + "grad_norm": 0.21971352421790485, + "learning_rate": 3.195848646569428e-06, + "loss": 0.18, + "num_tokens": 3753459166.0, + "step": 4921 + }, + { + "epoch": 6.715564422348586, + "grad_norm": 0.21281636608764773, + "learning_rate": 3.1933739166082366e-06, + "loss": 0.1851, + "num_tokens": 3754313391.0, + "step": 4922 + }, + { + "epoch": 6.716930003307266, + "grad_norm": 0.23176230163458247, + "learning_rate": 3.19090156811059e-06, + "loss": 0.1837, + "num_tokens": 3755051643.0, + "step": 4923 + }, + { + "epoch": 6.718295584265947, + "grad_norm": 0.23133142719492866, + "learning_rate": 3.1884316018306895e-06, + "loss": 0.1812, + "num_tokens": 3755840840.0, + "step": 4924 + }, + { + "epoch": 6.719661165224627, + "grad_norm": 0.2113225979647157, + "learning_rate": 3.1859640185220177e-06, + "loss": 0.1767, + "num_tokens": 3756641170.0, + "step": 4925 + }, + { + "epoch": 6.721026746183308, + "grad_norm": 0.20982620758816892, + "learning_rate": 3.183498818937326e-06, + "loss": 0.1782, + "num_tokens": 3757405074.0, + "step": 4926 + }, + { + "epoch": 6.722392327141988, + "grad_norm": 0.23403761838057588, + "learning_rate": 3.181036003828643e-06, + "loss": 0.1795, + "num_tokens": 3758130099.0, + "step": 4927 + }, + { + "epoch": 6.723757908100669, + "grad_norm": 0.21563686063944335, + "learning_rate": 3.1785755739472625e-06, + "loss": 0.1801, + "num_tokens": 3758891801.0, + "step": 4928 + }, + { + "epoch": 6.725123489059349, + "grad_norm": 0.2213222465597084, + "learning_rate": 3.176117530043758e-06, + "loss": 0.1831, + "num_tokens": 3759670298.0, + "step": 4929 + }, + { + "epoch": 6.7264890700180295, + "grad_norm": 0.21874447220926785, + "learning_rate": 3.173661872867973e-06, + "loss": 0.1773, + "num_tokens": 3760409195.0, + "step": 4930 + }, + { + "epoch": 6.7278546509767105, + "grad_norm": 0.21817156089574227, + "learning_rate": 3.1712086031690163e-06, + "loss": 0.1873, + "num_tokens": 3761175529.0, + "step": 4931 + }, + { + "epoch": 6.729220231935391, + "grad_norm": 0.22791579036172707, + "learning_rate": 3.1687577216952835e-06, + "loss": 0.1838, + "num_tokens": 3761901582.0, + "step": 4932 + }, + { + "epoch": 6.730585812894072, + "grad_norm": 0.22281414660799048, + "learning_rate": 3.166309229194426e-06, + "loss": 0.1819, + "num_tokens": 3762676076.0, + "step": 4933 + }, + { + "epoch": 6.731951393852752, + "grad_norm": 0.21982239791278763, + "learning_rate": 3.1638631264133734e-06, + "loss": 0.1787, + "num_tokens": 3763498810.0, + "step": 4934 + }, + { + "epoch": 6.733316974811433, + "grad_norm": 0.21683202795304135, + "learning_rate": 3.161419414098327e-06, + "loss": 0.1785, + "num_tokens": 3764265053.0, + "step": 4935 + }, + { + "epoch": 6.734682555770113, + "grad_norm": 0.21939550899078603, + "learning_rate": 3.1589780929947583e-06, + "loss": 0.1889, + "num_tokens": 3765071239.0, + "step": 4936 + }, + { + "epoch": 6.736048136728794, + "grad_norm": 0.22203717973198125, + "learning_rate": 3.156539163847407e-06, + "loss": 0.19, + "num_tokens": 3765904500.0, + "step": 4937 + }, + { + "epoch": 6.737413717687474, + "grad_norm": 0.2187174355768729, + "learning_rate": 3.1541026274002844e-06, + "loss": 0.1808, + "num_tokens": 3766668435.0, + "step": 4938 + }, + { + "epoch": 6.738779298646154, + "grad_norm": 0.22443028277913843, + "learning_rate": 3.151668484396677e-06, + "loss": 0.1692, + "num_tokens": 3767442022.0, + "step": 4939 + }, + { + "epoch": 6.740144879604835, + "grad_norm": 0.2199391990095458, + "learning_rate": 3.1492367355791264e-06, + "loss": 0.1834, + "num_tokens": 3768215505.0, + "step": 4940 + }, + { + "epoch": 6.741510460563515, + "grad_norm": 0.2237636313421162, + "learning_rate": 3.146807381689465e-06, + "loss": 0.1846, + "num_tokens": 3768892235.0, + "step": 4941 + }, + { + "epoch": 6.742876041522196, + "grad_norm": 0.22178800174679641, + "learning_rate": 3.1443804234687774e-06, + "loss": 0.1809, + "num_tokens": 3769776220.0, + "step": 4942 + }, + { + "epoch": 6.744241622480876, + "grad_norm": 0.1989079486803811, + "learning_rate": 3.1419558616574242e-06, + "loss": 0.1788, + "num_tokens": 3770488861.0, + "step": 4943 + }, + { + "epoch": 6.745607203439557, + "grad_norm": 0.24462345780391534, + "learning_rate": 3.139533696995036e-06, + "loss": 0.1712, + "num_tokens": 3771198472.0, + "step": 4944 + }, + { + "epoch": 6.746972784398237, + "grad_norm": 0.22162654198835027, + "learning_rate": 3.1371139302205067e-06, + "loss": 0.179, + "num_tokens": 3771961797.0, + "step": 4945 + }, + { + "epoch": 6.7483383653569184, + "grad_norm": 0.23222107348032206, + "learning_rate": 3.134696562072006e-06, + "loss": 0.1849, + "num_tokens": 3772739107.0, + "step": 4946 + }, + { + "epoch": 6.749703946315599, + "grad_norm": 0.21718864463616844, + "learning_rate": 3.1322815932869664e-06, + "loss": 0.18, + "num_tokens": 3773502271.0, + "step": 4947 + }, + { + "epoch": 6.751069527274279, + "grad_norm": 0.2243174822803025, + "learning_rate": 3.1298690246020902e-06, + "loss": 0.1836, + "num_tokens": 3774242645.0, + "step": 4948 + }, + { + "epoch": 6.75243510823296, + "grad_norm": 0.23053248855352537, + "learning_rate": 3.1274588567533494e-06, + "loss": 0.1915, + "num_tokens": 3775086458.0, + "step": 4949 + }, + { + "epoch": 6.75380068919164, + "grad_norm": 0.2169633570069198, + "learning_rate": 3.125051090475981e-06, + "loss": 0.1775, + "num_tokens": 3775843821.0, + "step": 4950 + }, + { + "epoch": 6.755166270150321, + "grad_norm": 0.21776206216142294, + "learning_rate": 3.1226457265044886e-06, + "loss": 0.1803, + "num_tokens": 3776655378.0, + "step": 4951 + }, + { + "epoch": 6.756531851109001, + "grad_norm": 0.21075031440922903, + "learning_rate": 3.120242765572646e-06, + "loss": 0.1787, + "num_tokens": 3777461879.0, + "step": 4952 + }, + { + "epoch": 6.757897432067682, + "grad_norm": 0.2155785538483473, + "learning_rate": 3.1178422084134946e-06, + "loss": 0.1831, + "num_tokens": 3778223092.0, + "step": 4953 + }, + { + "epoch": 6.759263013026362, + "grad_norm": 0.22179712040466867, + "learning_rate": 3.1154440557593363e-06, + "loss": 0.1861, + "num_tokens": 3778985790.0, + "step": 4954 + }, + { + "epoch": 6.760628593985043, + "grad_norm": 0.21236701666463942, + "learning_rate": 3.1130483083417443e-06, + "loss": 0.176, + "num_tokens": 3779709374.0, + "step": 4955 + }, + { + "epoch": 6.761994174943723, + "grad_norm": 0.2200353388768226, + "learning_rate": 3.1106549668915594e-06, + "loss": 0.1851, + "num_tokens": 3780483118.0, + "step": 4956 + }, + { + "epoch": 6.763359755902403, + "grad_norm": 0.21974595108029168, + "learning_rate": 3.108264032138886e-06, + "loss": 0.1767, + "num_tokens": 3781283858.0, + "step": 4957 + }, + { + "epoch": 6.764725336861084, + "grad_norm": 0.2270829474645771, + "learning_rate": 3.1058755048130935e-06, + "loss": 0.183, + "num_tokens": 3782077737.0, + "step": 4958 + }, + { + "epoch": 6.766090917819764, + "grad_norm": 0.2162055716243948, + "learning_rate": 3.1034893856428213e-06, + "loss": 0.1737, + "num_tokens": 3782772764.0, + "step": 4959 + }, + { + "epoch": 6.767456498778445, + "grad_norm": 0.22114579383999353, + "learning_rate": 3.101105675355967e-06, + "loss": 0.1775, + "num_tokens": 3783554966.0, + "step": 4960 + }, + { + "epoch": 6.7688220797371255, + "grad_norm": 0.20968240633467986, + "learning_rate": 3.0987243746796997e-06, + "loss": 0.1826, + "num_tokens": 3784310842.0, + "step": 4961 + }, + { + "epoch": 6.7701876606958065, + "grad_norm": 0.22926899084843713, + "learning_rate": 3.096345484340451e-06, + "loss": 0.1781, + "num_tokens": 3785048854.0, + "step": 4962 + }, + { + "epoch": 6.771553241654487, + "grad_norm": 0.2286487878994961, + "learning_rate": 3.0939690050639155e-06, + "loss": 0.1828, + "num_tokens": 3785824720.0, + "step": 4963 + }, + { + "epoch": 6.772918822613168, + "grad_norm": 0.21039169942129382, + "learning_rate": 3.0915949375750565e-06, + "loss": 0.1829, + "num_tokens": 3786591500.0, + "step": 4964 + }, + { + "epoch": 6.774284403571848, + "grad_norm": 0.22042908699422537, + "learning_rate": 3.089223282598097e-06, + "loss": 0.1889, + "num_tokens": 3787397898.0, + "step": 4965 + }, + { + "epoch": 6.775649984530528, + "grad_norm": 0.2226793496481801, + "learning_rate": 3.086854040856528e-06, + "loss": 0.1794, + "num_tokens": 3788116540.0, + "step": 4966 + }, + { + "epoch": 6.777015565489209, + "grad_norm": 0.2361497052677204, + "learning_rate": 3.0844872130731023e-06, + "loss": 0.1782, + "num_tokens": 3788856192.0, + "step": 4967 + }, + { + "epoch": 6.778381146447889, + "grad_norm": 0.21476224475294475, + "learning_rate": 3.082122799969836e-06, + "loss": 0.1833, + "num_tokens": 3789642109.0, + "step": 4968 + }, + { + "epoch": 6.77974672740657, + "grad_norm": 0.2174007059624105, + "learning_rate": 3.0797608022680085e-06, + "loss": 0.1843, + "num_tokens": 3790393148.0, + "step": 4969 + }, + { + "epoch": 6.78111230836525, + "grad_norm": 0.22851487059765943, + "learning_rate": 3.077401220688165e-06, + "loss": 0.1775, + "num_tokens": 3791179744.0, + "step": 4970 + }, + { + "epoch": 6.782477889323931, + "grad_norm": 0.21604775845608645, + "learning_rate": 3.0750440559501105e-06, + "loss": 0.1849, + "num_tokens": 3791989393.0, + "step": 4971 + }, + { + "epoch": 6.783843470282611, + "grad_norm": 0.24046680700351344, + "learning_rate": 3.0726893087729138e-06, + "loss": 0.1681, + "num_tokens": 3792755321.0, + "step": 4972 + }, + { + "epoch": 6.785209051241292, + "grad_norm": 0.21178439571384558, + "learning_rate": 3.070336979874905e-06, + "loss": 0.1858, + "num_tokens": 3793559913.0, + "step": 4973 + }, + { + "epoch": 6.786574632199972, + "grad_norm": 0.2235378305521276, + "learning_rate": 3.06798706997368e-06, + "loss": 0.1771, + "num_tokens": 3794307912.0, + "step": 4974 + }, + { + "epoch": 6.787940213158652, + "grad_norm": 0.2297757994312317, + "learning_rate": 3.0656395797860928e-06, + "loss": 0.1726, + "num_tokens": 3795024514.0, + "step": 4975 + }, + { + "epoch": 6.789305794117333, + "grad_norm": 0.22563998949438702, + "learning_rate": 3.063294510028265e-06, + "loss": 0.1763, + "num_tokens": 3795813635.0, + "step": 4976 + }, + { + "epoch": 6.7906713750760135, + "grad_norm": 0.22144764880471945, + "learning_rate": 3.0609518614155694e-06, + "loss": 0.1801, + "num_tokens": 3796530108.0, + "step": 4977 + }, + { + "epoch": 6.7920369560346945, + "grad_norm": 0.22321842473062675, + "learning_rate": 3.0586116346626503e-06, + "loss": 0.1692, + "num_tokens": 3797198179.0, + "step": 4978 + }, + { + "epoch": 6.793402536993375, + "grad_norm": 0.23164729921087102, + "learning_rate": 3.0562738304834106e-06, + "loss": 0.1742, + "num_tokens": 3797902352.0, + "step": 4979 + }, + { + "epoch": 6.794768117952056, + "grad_norm": 0.224752654644207, + "learning_rate": 3.0539384495910123e-06, + "loss": 0.1862, + "num_tokens": 3798656909.0, + "step": 4980 + }, + { + "epoch": 6.796133698910736, + "grad_norm": 0.22511060133799465, + "learning_rate": 3.0516054926978744e-06, + "loss": 0.1785, + "num_tokens": 3799396961.0, + "step": 4981 + }, + { + "epoch": 6.797499279869417, + "grad_norm": 0.22783253984652851, + "learning_rate": 3.049274960515689e-06, + "loss": 0.188, + "num_tokens": 3800264390.0, + "step": 4982 + }, + { + "epoch": 6.798864860828097, + "grad_norm": 0.21786044036412722, + "learning_rate": 3.046946853755397e-06, + "loss": 0.1777, + "num_tokens": 3801063181.0, + "step": 4983 + }, + { + "epoch": 6.800230441786777, + "grad_norm": 0.2199127163170723, + "learning_rate": 3.0446211731271977e-06, + "loss": 0.1746, + "num_tokens": 3801811720.0, + "step": 4984 + }, + { + "epoch": 6.801596022745458, + "grad_norm": 0.2140209848323263, + "learning_rate": 3.0422979193405643e-06, + "loss": 0.1877, + "num_tokens": 3802586615.0, + "step": 4985 + }, + { + "epoch": 6.802961603704138, + "grad_norm": 0.23015295801529426, + "learning_rate": 3.039977093104216e-06, + "loss": 0.1811, + "num_tokens": 3803334416.0, + "step": 4986 + }, + { + "epoch": 6.804327184662819, + "grad_norm": 0.2140254974280941, + "learning_rate": 3.0376586951261367e-06, + "loss": 0.1893, + "num_tokens": 3804120562.0, + "step": 4987 + }, + { + "epoch": 6.805692765621499, + "grad_norm": 0.23344861733081992, + "learning_rate": 3.0353427261135703e-06, + "loss": 0.1838, + "num_tokens": 3804907449.0, + "step": 4988 + }, + { + "epoch": 6.80705834658018, + "grad_norm": 0.22991190181386892, + "learning_rate": 3.03302918677302e-06, + "loss": 0.176, + "num_tokens": 3805605039.0, + "step": 4989 + }, + { + "epoch": 6.80842392753886, + "grad_norm": 0.22399608139878693, + "learning_rate": 3.030718077810242e-06, + "loss": 0.1851, + "num_tokens": 3806454566.0, + "step": 4990 + }, + { + "epoch": 6.809789508497541, + "grad_norm": 0.22262154806888382, + "learning_rate": 3.028409399930258e-06, + "loss": 0.1792, + "num_tokens": 3807283743.0, + "step": 4991 + }, + { + "epoch": 6.811155089456221, + "grad_norm": 0.2129750839184565, + "learning_rate": 3.026103153837348e-06, + "loss": 0.1888, + "num_tokens": 3808003476.0, + "step": 4992 + }, + { + "epoch": 6.8125206704149015, + "grad_norm": 0.23108406978154383, + "learning_rate": 3.023799340235043e-06, + "loss": 0.1759, + "num_tokens": 3808761311.0, + "step": 4993 + }, + { + "epoch": 6.8138862513735825, + "grad_norm": 0.2209551317069388, + "learning_rate": 3.021497959826142e-06, + "loss": 0.1857, + "num_tokens": 3809601840.0, + "step": 4994 + }, + { + "epoch": 6.815251832332263, + "grad_norm": 0.211713712406043, + "learning_rate": 3.019199013312693e-06, + "loss": 0.1712, + "num_tokens": 3810296487.0, + "step": 4995 + }, + { + "epoch": 6.816617413290944, + "grad_norm": 0.23456398117019026, + "learning_rate": 3.016902501396005e-06, + "loss": 0.1819, + "num_tokens": 3811048864.0, + "step": 4996 + }, + { + "epoch": 6.817982994249624, + "grad_norm": 0.23285948591489183, + "learning_rate": 3.014608424776646e-06, + "loss": 0.1746, + "num_tokens": 3811750342.0, + "step": 4997 + }, + { + "epoch": 6.819348575208305, + "grad_norm": 0.22861259603637232, + "learning_rate": 3.0123167841544398e-06, + "loss": 0.1842, + "num_tokens": 3812495918.0, + "step": 4998 + }, + { + "epoch": 6.820714156166985, + "grad_norm": 0.23569376396036495, + "learning_rate": 3.0100275802284643e-06, + "loss": 0.1833, + "num_tokens": 3813294193.0, + "step": 4999 + }, + { + "epoch": 6.822079737125666, + "grad_norm": 0.21652172589337032, + "learning_rate": 3.0077408136970574e-06, + "loss": 0.1826, + "num_tokens": 3814097527.0, + "step": 5000 + }, + { + "epoch": 6.823445318084346, + "grad_norm": 0.21978716988253072, + "learning_rate": 3.0054564852578155e-06, + "loss": 0.1823, + "num_tokens": 3814895511.0, + "step": 5001 + }, + { + "epoch": 6.824810899043026, + "grad_norm": 0.23187877112705985, + "learning_rate": 3.0031745956075807e-06, + "loss": 0.1886, + "num_tokens": 3815668973.0, + "step": 5002 + }, + { + "epoch": 6.826176480001707, + "grad_norm": 0.2637105121727647, + "learning_rate": 3.000895145442469e-06, + "loss": 0.1873, + "num_tokens": 3816435258.0, + "step": 5003 + }, + { + "epoch": 6.827542060960387, + "grad_norm": 0.247809750415901, + "learning_rate": 2.998618135457833e-06, + "loss": 0.1856, + "num_tokens": 3817205954.0, + "step": 5004 + }, + { + "epoch": 6.828907641919068, + "grad_norm": 0.21028389035975867, + "learning_rate": 2.9963435663482956e-06, + "loss": 0.1836, + "num_tokens": 3817991659.0, + "step": 5005 + }, + { + "epoch": 6.830273222877748, + "grad_norm": 0.2206729542065838, + "learning_rate": 2.994071438807727e-06, + "loss": 0.1846, + "num_tokens": 3818833465.0, + "step": 5006 + }, + { + "epoch": 6.831638803836429, + "grad_norm": 0.2217622087322051, + "learning_rate": 2.9918017535292548e-06, + "loss": 0.1863, + "num_tokens": 3819547114.0, + "step": 5007 + }, + { + "epoch": 6.833004384795109, + "grad_norm": 0.22381997029858292, + "learning_rate": 2.9895345112052613e-06, + "loss": 0.171, + "num_tokens": 3820328263.0, + "step": 5008 + }, + { + "epoch": 6.83436996575379, + "grad_norm": 0.21878891162465394, + "learning_rate": 2.9872697125273847e-06, + "loss": 0.1775, + "num_tokens": 3821154686.0, + "step": 5009 + }, + { + "epoch": 6.8357355467124705, + "grad_norm": 0.2337429310284699, + "learning_rate": 2.9850073581865184e-06, + "loss": 0.1882, + "num_tokens": 3821893728.0, + "step": 5010 + }, + { + "epoch": 6.837101127671151, + "grad_norm": 0.2240471414109968, + "learning_rate": 2.9827474488728044e-06, + "loss": 0.1801, + "num_tokens": 3822652488.0, + "step": 5011 + }, + { + "epoch": 6.838466708629832, + "grad_norm": 0.21453685021457702, + "learning_rate": 2.980489985275649e-06, + "loss": 0.1819, + "num_tokens": 3823479741.0, + "step": 5012 + }, + { + "epoch": 6.839832289588512, + "grad_norm": 0.22016860134144678, + "learning_rate": 2.9782349680837024e-06, + "loss": 0.1765, + "num_tokens": 3824186577.0, + "step": 5013 + }, + { + "epoch": 6.841197870547193, + "grad_norm": 0.2322667347510334, + "learning_rate": 2.975982397984874e-06, + "loss": 0.1801, + "num_tokens": 3824975533.0, + "step": 5014 + }, + { + "epoch": 6.842563451505873, + "grad_norm": 0.2089813505495261, + "learning_rate": 2.973732275666327e-06, + "loss": 0.1892, + "num_tokens": 3825818876.0, + "step": 5015 + }, + { + "epoch": 6.843929032464554, + "grad_norm": 0.2248210381635068, + "learning_rate": 2.971484601814473e-06, + "loss": 0.1808, + "num_tokens": 3826577474.0, + "step": 5016 + }, + { + "epoch": 6.845294613423234, + "grad_norm": 0.21379923023655595, + "learning_rate": 2.969239377114982e-06, + "loss": 0.1838, + "num_tokens": 3827338494.0, + "step": 5017 + }, + { + "epoch": 6.846660194381915, + "grad_norm": 0.22644137501887937, + "learning_rate": 2.966996602252773e-06, + "loss": 0.1779, + "num_tokens": 3828093501.0, + "step": 5018 + }, + { + "epoch": 6.848025775340595, + "grad_norm": 0.22247120247893012, + "learning_rate": 2.9647562779120233e-06, + "loss": 0.1688, + "num_tokens": 3828802521.0, + "step": 5019 + }, + { + "epoch": 6.849391356299275, + "grad_norm": 0.22164292679484485, + "learning_rate": 2.9625184047761522e-06, + "loss": 0.1852, + "num_tokens": 3829577322.0, + "step": 5020 + }, + { + "epoch": 6.850756937257956, + "grad_norm": 0.22003024353340714, + "learning_rate": 2.9602829835278464e-06, + "loss": 0.1844, + "num_tokens": 3830378979.0, + "step": 5021 + }, + { + "epoch": 6.852122518216636, + "grad_norm": 0.210584562209536, + "learning_rate": 2.9580500148490297e-06, + "loss": 0.1814, + "num_tokens": 3831153186.0, + "step": 5022 + }, + { + "epoch": 6.853488099175317, + "grad_norm": 0.22598251301136388, + "learning_rate": 2.955819499420887e-06, + "loss": 0.1823, + "num_tokens": 3831932008.0, + "step": 5023 + }, + { + "epoch": 6.854853680133997, + "grad_norm": 0.2210176135938933, + "learning_rate": 2.9535914379238518e-06, + "loss": 0.1831, + "num_tokens": 3832698183.0, + "step": 5024 + }, + { + "epoch": 6.856219261092678, + "grad_norm": 0.23763352093764192, + "learning_rate": 2.9513658310376075e-06, + "loss": 0.1733, + "num_tokens": 3833439499.0, + "step": 5025 + }, + { + "epoch": 6.8575848420513585, + "grad_norm": 0.3322242820485917, + "learning_rate": 2.9491426794410918e-06, + "loss": 0.1811, + "num_tokens": 3834150219.0, + "step": 5026 + }, + { + "epoch": 6.8589504230100395, + "grad_norm": 0.23215579782512066, + "learning_rate": 2.94692198381249e-06, + "loss": 0.1743, + "num_tokens": 3834860406.0, + "step": 5027 + }, + { + "epoch": 6.86031600396872, + "grad_norm": 0.2334142971782333, + "learning_rate": 2.944703744829245e-06, + "loss": 0.1845, + "num_tokens": 3835644412.0, + "step": 5028 + }, + { + "epoch": 6.8616815849274, + "grad_norm": 0.2205079991863466, + "learning_rate": 2.9424879631680397e-06, + "loss": 0.1817, + "num_tokens": 3836365438.0, + "step": 5029 + }, + { + "epoch": 6.863047165886081, + "grad_norm": 0.22528064964760064, + "learning_rate": 2.9402746395048164e-06, + "loss": 0.1817, + "num_tokens": 3837117155.0, + "step": 5030 + }, + { + "epoch": 6.864412746844761, + "grad_norm": 0.22848848707834538, + "learning_rate": 2.9380637745147622e-06, + "loss": 0.1737, + "num_tokens": 3837823810.0, + "step": 5031 + }, + { + "epoch": 6.865778327803442, + "grad_norm": 0.24648195688209285, + "learning_rate": 2.935855368872319e-06, + "loss": 0.185, + "num_tokens": 3838534500.0, + "step": 5032 + }, + { + "epoch": 6.867143908762122, + "grad_norm": 0.21608876711878142, + "learning_rate": 2.9336494232511757e-06, + "loss": 0.1819, + "num_tokens": 3839369779.0, + "step": 5033 + }, + { + "epoch": 6.868509489720803, + "grad_norm": 0.2238790039316482, + "learning_rate": 2.931445938324268e-06, + "loss": 0.1888, + "num_tokens": 3840143420.0, + "step": 5034 + }, + { + "epoch": 6.869875070679483, + "grad_norm": 0.2223878390643004, + "learning_rate": 2.9292449147637848e-06, + "loss": 0.1812, + "num_tokens": 3840932082.0, + "step": 5035 + }, + { + "epoch": 6.871240651638164, + "grad_norm": 0.21252344215450844, + "learning_rate": 2.9270463532411634e-06, + "loss": 0.1841, + "num_tokens": 3841694212.0, + "step": 5036 + }, + { + "epoch": 6.872606232596844, + "grad_norm": 0.2207917860331854, + "learning_rate": 2.9248502544270896e-06, + "loss": 0.1844, + "num_tokens": 3842486151.0, + "step": 5037 + }, + { + "epoch": 6.873971813555524, + "grad_norm": 0.21987312271577825, + "learning_rate": 2.9226566189914995e-06, + "loss": 0.1851, + "num_tokens": 3843261714.0, + "step": 5038 + }, + { + "epoch": 6.875337394514205, + "grad_norm": 0.24163495787795777, + "learning_rate": 2.9204654476035744e-06, + "loss": 0.1831, + "num_tokens": 3844006127.0, + "step": 5039 + }, + { + "epoch": 6.8767029754728854, + "grad_norm": 0.21402363101809663, + "learning_rate": 2.918276740931746e-06, + "loss": 0.1882, + "num_tokens": 3844796036.0, + "step": 5040 + }, + { + "epoch": 6.8780685564315664, + "grad_norm": 0.22799698163600932, + "learning_rate": 2.9160904996436932e-06, + "loss": 0.187, + "num_tokens": 3845533214.0, + "step": 5041 + }, + { + "epoch": 6.879434137390247, + "grad_norm": 0.20904555530852242, + "learning_rate": 2.9139067244063467e-06, + "loss": 0.181, + "num_tokens": 3846285358.0, + "step": 5042 + }, + { + "epoch": 6.880799718348928, + "grad_norm": 0.22308078533520775, + "learning_rate": 2.9117254158858787e-06, + "loss": 0.1769, + "num_tokens": 3847041612.0, + "step": 5043 + }, + { + "epoch": 6.882165299307608, + "grad_norm": 0.23307612801622327, + "learning_rate": 2.9095465747477133e-06, + "loss": 0.1799, + "num_tokens": 3847812082.0, + "step": 5044 + }, + { + "epoch": 6.883530880266289, + "grad_norm": 0.22332650811332444, + "learning_rate": 2.907370201656521e-06, + "loss": 0.1801, + "num_tokens": 3848620062.0, + "step": 5045 + }, + { + "epoch": 6.884896461224969, + "grad_norm": 0.21949665970037205, + "learning_rate": 2.905196297276215e-06, + "loss": 0.183, + "num_tokens": 3849348078.0, + "step": 5046 + }, + { + "epoch": 6.886262042183649, + "grad_norm": 0.235619881015995, + "learning_rate": 2.9030248622699657e-06, + "loss": 0.1781, + "num_tokens": 3850105462.0, + "step": 5047 + }, + { + "epoch": 6.88762762314233, + "grad_norm": 0.23421292425732562, + "learning_rate": 2.9008558973001788e-06, + "loss": 0.1881, + "num_tokens": 3850848302.0, + "step": 5048 + }, + { + "epoch": 6.88899320410101, + "grad_norm": 0.23556033867283066, + "learning_rate": 2.8986894030285146e-06, + "loss": 0.1715, + "num_tokens": 3851549222.0, + "step": 5049 + }, + { + "epoch": 6.890358785059691, + "grad_norm": 0.23169821967740284, + "learning_rate": 2.8965253801158744e-06, + "loss": 0.1837, + "num_tokens": 3852279486.0, + "step": 5050 + }, + { + "epoch": 6.891724366018371, + "grad_norm": 0.22379965198154578, + "learning_rate": 2.8943638292224115e-06, + "loss": 0.1786, + "num_tokens": 3853023235.0, + "step": 5051 + }, + { + "epoch": 6.893089946977052, + "grad_norm": 0.22883185278047585, + "learning_rate": 2.8922047510075174e-06, + "loss": 0.1822, + "num_tokens": 3853804095.0, + "step": 5052 + }, + { + "epoch": 6.894455527935732, + "grad_norm": 0.233874088290576, + "learning_rate": 2.8900481461298347e-06, + "loss": 0.1773, + "num_tokens": 3854634477.0, + "step": 5053 + }, + { + "epoch": 6.895821108894413, + "grad_norm": 0.2065778106233143, + "learning_rate": 2.8878940152472514e-06, + "loss": 0.1855, + "num_tokens": 3855461523.0, + "step": 5054 + }, + { + "epoch": 6.897186689853093, + "grad_norm": 0.21396776329575898, + "learning_rate": 2.8857423590168964e-06, + "loss": 0.1777, + "num_tokens": 3856284734.0, + "step": 5055 + }, + { + "epoch": 6.8985522708117735, + "grad_norm": 0.21409820278479227, + "learning_rate": 2.8835931780951525e-06, + "loss": 0.1731, + "num_tokens": 3857004904.0, + "step": 5056 + }, + { + "epoch": 6.8999178517704545, + "grad_norm": 0.2197813123863557, + "learning_rate": 2.8814464731376356e-06, + "loss": 0.1802, + "num_tokens": 3857767665.0, + "step": 5057 + }, + { + "epoch": 6.901283432729135, + "grad_norm": 0.23309901866232938, + "learning_rate": 2.879302244799216e-06, + "loss": 0.18, + "num_tokens": 3858507107.0, + "step": 5058 + }, + { + "epoch": 6.902649013687816, + "grad_norm": 0.22967108188677, + "learning_rate": 2.877160493734003e-06, + "loss": 0.1807, + "num_tokens": 3859253769.0, + "step": 5059 + }, + { + "epoch": 6.904014594646496, + "grad_norm": 0.2221628989719488, + "learning_rate": 2.8750212205953565e-06, + "loss": 0.1875, + "num_tokens": 3860080498.0, + "step": 5060 + }, + { + "epoch": 6.905380175605177, + "grad_norm": 0.2187223330935441, + "learning_rate": 2.872884426035869e-06, + "loss": 0.1718, + "num_tokens": 3860793528.0, + "step": 5061 + }, + { + "epoch": 6.906745756563857, + "grad_norm": 0.23117481758273262, + "learning_rate": 2.8707501107073887e-06, + "loss": 0.1873, + "num_tokens": 3861572081.0, + "step": 5062 + }, + { + "epoch": 6.908111337522538, + "grad_norm": 0.21605820723372357, + "learning_rate": 2.8686182752610025e-06, + "loss": 0.1789, + "num_tokens": 3862373481.0, + "step": 5063 + }, + { + "epoch": 6.909476918481218, + "grad_norm": 0.2257453827444246, + "learning_rate": 2.8664889203470388e-06, + "loss": 0.1806, + "num_tokens": 3863095340.0, + "step": 5064 + }, + { + "epoch": 6.910842499439898, + "grad_norm": 0.21660299515465173, + "learning_rate": 2.864362046615075e-06, + "loss": 0.1753, + "num_tokens": 3863842990.0, + "step": 5065 + }, + { + "epoch": 6.912208080398579, + "grad_norm": 0.2219463476254167, + "learning_rate": 2.8622376547139254e-06, + "loss": 0.1773, + "num_tokens": 3864608076.0, + "step": 5066 + }, + { + "epoch": 6.913573661357259, + "grad_norm": 0.21831453690105718, + "learning_rate": 2.86011574529165e-06, + "loss": 0.1793, + "num_tokens": 3865371467.0, + "step": 5067 + }, + { + "epoch": 6.91493924231594, + "grad_norm": 0.2327004508800191, + "learning_rate": 2.857996318995553e-06, + "loss": 0.1806, + "num_tokens": 3866195731.0, + "step": 5068 + }, + { + "epoch": 6.91630482327462, + "grad_norm": 0.21600267394288353, + "learning_rate": 2.855879376472179e-06, + "loss": 0.1827, + "num_tokens": 3866932539.0, + "step": 5069 + }, + { + "epoch": 6.917670404233301, + "grad_norm": 0.23135868543785096, + "learning_rate": 2.8537649183673155e-06, + "loss": 0.1812, + "num_tokens": 3867667610.0, + "step": 5070 + }, + { + "epoch": 6.919035985191981, + "grad_norm": 0.23750019712323014, + "learning_rate": 2.8516529453259913e-06, + "loss": 0.179, + "num_tokens": 3868387990.0, + "step": 5071 + }, + { + "epoch": 6.920401566150662, + "grad_norm": 0.2171056960038839, + "learning_rate": 2.8495434579924808e-06, + "loss": 0.1818, + "num_tokens": 3869145399.0, + "step": 5072 + }, + { + "epoch": 6.9217671471093425, + "grad_norm": 0.23651155901790955, + "learning_rate": 2.8474364570102907e-06, + "loss": 0.1837, + "num_tokens": 3870018970.0, + "step": 5073 + }, + { + "epoch": 6.923132728068023, + "grad_norm": 0.21003115865836064, + "learning_rate": 2.845331943022184e-06, + "loss": 0.1843, + "num_tokens": 3870775909.0, + "step": 5074 + }, + { + "epoch": 6.924498309026704, + "grad_norm": 0.22441250525690054, + "learning_rate": 2.843229916670151e-06, + "loss": 0.1833, + "num_tokens": 3871577356.0, + "step": 5075 + }, + { + "epoch": 6.925863889985384, + "grad_norm": 0.22033218520218506, + "learning_rate": 2.8411303785954304e-06, + "loss": 0.1828, + "num_tokens": 3872360216.0, + "step": 5076 + }, + { + "epoch": 6.927229470944065, + "grad_norm": 0.23618449284724385, + "learning_rate": 2.839033329438503e-06, + "loss": 0.1689, + "num_tokens": 3873143448.0, + "step": 5077 + }, + { + "epoch": 6.928595051902745, + "grad_norm": 0.20315508477265815, + "learning_rate": 2.8369387698390838e-06, + "loss": 0.1832, + "num_tokens": 3873878392.0, + "step": 5078 + }, + { + "epoch": 6.929960632861426, + "grad_norm": 0.23369438880919305, + "learning_rate": 2.834846700436133e-06, + "loss": 0.178, + "num_tokens": 3874635322.0, + "step": 5079 + }, + { + "epoch": 6.931326213820106, + "grad_norm": 0.21570767393180865, + "learning_rate": 2.8327571218678517e-06, + "loss": 0.1832, + "num_tokens": 3875413840.0, + "step": 5080 + }, + { + "epoch": 6.932691794778787, + "grad_norm": 0.22902416787061503, + "learning_rate": 2.8306700347716803e-06, + "loss": 0.1881, + "num_tokens": 3876167627.0, + "step": 5081 + }, + { + "epoch": 6.934057375737467, + "grad_norm": 0.23051647524816557, + "learning_rate": 2.828585439784295e-06, + "loss": 0.1802, + "num_tokens": 3876937321.0, + "step": 5082 + }, + { + "epoch": 6.935422956696147, + "grad_norm": 0.21701848500426432, + "learning_rate": 2.8265033375416213e-06, + "loss": 0.1879, + "num_tokens": 3877754464.0, + "step": 5083 + }, + { + "epoch": 6.936788537654828, + "grad_norm": 0.21241519731207809, + "learning_rate": 2.8244237286788157e-06, + "loss": 0.1736, + "num_tokens": 3878573801.0, + "step": 5084 + }, + { + "epoch": 6.938154118613508, + "grad_norm": 0.21308531017887677, + "learning_rate": 2.8223466138302723e-06, + "loss": 0.1809, + "num_tokens": 3879322391.0, + "step": 5085 + }, + { + "epoch": 6.939519699572189, + "grad_norm": 0.22084227910686982, + "learning_rate": 2.8202719936296367e-06, + "loss": 0.1789, + "num_tokens": 3880078095.0, + "step": 5086 + }, + { + "epoch": 6.940885280530869, + "grad_norm": 0.23288934559585775, + "learning_rate": 2.81819986870978e-06, + "loss": 0.1836, + "num_tokens": 3880792429.0, + "step": 5087 + }, + { + "epoch": 6.94225086148955, + "grad_norm": 0.22887212006856153, + "learning_rate": 2.816130239702821e-06, + "loss": 0.178, + "num_tokens": 3881590359.0, + "step": 5088 + }, + { + "epoch": 6.9436164424482305, + "grad_norm": 0.2058289350309943, + "learning_rate": 2.814063107240111e-06, + "loss": 0.1796, + "num_tokens": 3882303463.0, + "step": 5089 + }, + { + "epoch": 6.9449820234069115, + "grad_norm": 0.23772961699492431, + "learning_rate": 2.811998471952247e-06, + "loss": 0.1825, + "num_tokens": 3883126732.0, + "step": 5090 + }, + { + "epoch": 6.946347604365592, + "grad_norm": 0.2197494738323331, + "learning_rate": 2.809936334469053e-06, + "loss": 0.1753, + "num_tokens": 3883865117.0, + "step": 5091 + }, + { + "epoch": 6.947713185324272, + "grad_norm": 0.2228480783327344, + "learning_rate": 2.8078766954196046e-06, + "loss": 0.1848, + "num_tokens": 3884616440.0, + "step": 5092 + }, + { + "epoch": 6.949078766282953, + "grad_norm": 0.22908140027406343, + "learning_rate": 2.805819555432206e-06, + "loss": 0.1832, + "num_tokens": 3885409433.0, + "step": 5093 + }, + { + "epoch": 6.950444347241633, + "grad_norm": 0.21813101056012943, + "learning_rate": 2.8037649151343988e-06, + "loss": 0.1879, + "num_tokens": 3886130430.0, + "step": 5094 + }, + { + "epoch": 6.951809928200314, + "grad_norm": 0.22186585251689542, + "learning_rate": 2.8017127751529704e-06, + "loss": 0.1758, + "num_tokens": 3886959051.0, + "step": 5095 + }, + { + "epoch": 6.953175509158994, + "grad_norm": 0.21222491317254105, + "learning_rate": 2.799663136113935e-06, + "loss": 0.1813, + "num_tokens": 3887738326.0, + "step": 5096 + }, + { + "epoch": 6.954541090117675, + "grad_norm": 0.2183823079326851, + "learning_rate": 2.7976159986425494e-06, + "loss": 0.1887, + "num_tokens": 3888520295.0, + "step": 5097 + }, + { + "epoch": 6.955906671076355, + "grad_norm": 0.22187448787630398, + "learning_rate": 2.7955713633633085e-06, + "loss": 0.1835, + "num_tokens": 3889234019.0, + "step": 5098 + }, + { + "epoch": 6.957272252035036, + "grad_norm": 0.2313389338024401, + "learning_rate": 2.7935292308999424e-06, + "loss": 0.1879, + "num_tokens": 3890025719.0, + "step": 5099 + }, + { + "epoch": 6.958637832993716, + "grad_norm": 0.24158249824822886, + "learning_rate": 2.7914896018754146e-06, + "loss": 0.1874, + "num_tokens": 3890729564.0, + "step": 5100 + }, + { + "epoch": 6.960003413952396, + "grad_norm": 0.2190861235122049, + "learning_rate": 2.789452476911928e-06, + "loss": 0.1802, + "num_tokens": 3891490208.0, + "step": 5101 + }, + { + "epoch": 6.961368994911077, + "grad_norm": 0.22345579427645199, + "learning_rate": 2.787417856630923e-06, + "loss": 0.1842, + "num_tokens": 3892264007.0, + "step": 5102 + }, + { + "epoch": 6.962734575869757, + "grad_norm": 0.2856814162618208, + "learning_rate": 2.785385741653073e-06, + "loss": 0.1843, + "num_tokens": 3893051470.0, + "step": 5103 + }, + { + "epoch": 6.964100156828438, + "grad_norm": 0.2273026566033108, + "learning_rate": 2.7833561325982894e-06, + "loss": 0.1835, + "num_tokens": 3893775920.0, + "step": 5104 + }, + { + "epoch": 6.9654657377871185, + "grad_norm": 0.22650029831917734, + "learning_rate": 2.7813290300857164e-06, + "loss": 0.1859, + "num_tokens": 3894556649.0, + "step": 5105 + }, + { + "epoch": 6.9668313187457995, + "grad_norm": 0.2378196381467257, + "learning_rate": 2.779304434733736e-06, + "loss": 0.1736, + "num_tokens": 3895252351.0, + "step": 5106 + }, + { + "epoch": 6.96819689970448, + "grad_norm": 0.21797302296221077, + "learning_rate": 2.777282347159964e-06, + "loss": 0.1817, + "num_tokens": 3895975972.0, + "step": 5107 + }, + { + "epoch": 6.969562480663161, + "grad_norm": 0.23187303730194023, + "learning_rate": 2.7752627679812543e-06, + "loss": 0.1842, + "num_tokens": 3896684622.0, + "step": 5108 + }, + { + "epoch": 6.970928061621841, + "grad_norm": 0.23632318406478975, + "learning_rate": 2.773245697813689e-06, + "loss": 0.1739, + "num_tokens": 3897492108.0, + "step": 5109 + }, + { + "epoch": 6.972293642580521, + "grad_norm": 0.20450017326389747, + "learning_rate": 2.771231137272591e-06, + "loss": 0.1798, + "num_tokens": 3898236031.0, + "step": 5110 + }, + { + "epoch": 6.973659223539202, + "grad_norm": 0.2201234673607879, + "learning_rate": 2.769219086972516e-06, + "loss": 0.1851, + "num_tokens": 3898994040.0, + "step": 5111 + }, + { + "epoch": 6.975024804497882, + "grad_norm": 0.21738290873865976, + "learning_rate": 2.767209547527252e-06, + "loss": 0.1782, + "num_tokens": 3899722854.0, + "step": 5112 + }, + { + "epoch": 6.976390385456563, + "grad_norm": 0.21824840069850218, + "learning_rate": 2.765202519549825e-06, + "loss": 0.1759, + "num_tokens": 3900373100.0, + "step": 5113 + }, + { + "epoch": 6.977755966415243, + "grad_norm": 0.23240688053025696, + "learning_rate": 2.763198003652489e-06, + "loss": 0.1807, + "num_tokens": 3901170777.0, + "step": 5114 + }, + { + "epoch": 6.979121547373924, + "grad_norm": 0.22020834455255098, + "learning_rate": 2.7611960004467364e-06, + "loss": 0.1853, + "num_tokens": 3901928346.0, + "step": 5115 + }, + { + "epoch": 6.980487128332604, + "grad_norm": 0.23243546115561042, + "learning_rate": 2.7591965105432933e-06, + "loss": 0.1801, + "num_tokens": 3902697700.0, + "step": 5116 + }, + { + "epoch": 6.981852709291285, + "grad_norm": 0.2215790161730926, + "learning_rate": 2.7571995345521145e-06, + "loss": 0.1711, + "num_tokens": 3903458551.0, + "step": 5117 + }, + { + "epoch": 6.983218290249965, + "grad_norm": 0.22535483227470726, + "learning_rate": 2.755205073082392e-06, + "loss": 0.181, + "num_tokens": 3904206584.0, + "step": 5118 + }, + { + "epoch": 6.984583871208645, + "grad_norm": 0.21830997021070098, + "learning_rate": 2.7532131267425503e-06, + "loss": 0.1838, + "num_tokens": 3904903156.0, + "step": 5119 + }, + { + "epoch": 6.985949452167326, + "grad_norm": 0.23426569233936553, + "learning_rate": 2.7512236961402466e-06, + "loss": 0.1814, + "num_tokens": 3905701816.0, + "step": 5120 + }, + { + "epoch": 6.9873150331260065, + "grad_norm": 0.2343640586194323, + "learning_rate": 2.749236781882367e-06, + "loss": 0.1824, + "num_tokens": 3906416443.0, + "step": 5121 + }, + { + "epoch": 6.9886806140846875, + "grad_norm": 0.2271115252775791, + "learning_rate": 2.747252384575038e-06, + "loss": 0.1759, + "num_tokens": 3907144273.0, + "step": 5122 + }, + { + "epoch": 6.990046195043368, + "grad_norm": 0.21297233081570185, + "learning_rate": 2.745270504823608e-06, + "loss": 0.18, + "num_tokens": 3907852900.0, + "step": 5123 + }, + { + "epoch": 6.991411776002049, + "grad_norm": 0.23681075559167175, + "learning_rate": 2.7432911432326666e-06, + "loss": 0.1802, + "num_tokens": 3908541402.0, + "step": 5124 + }, + { + "epoch": 6.992777356960729, + "grad_norm": 0.22863374880342954, + "learning_rate": 2.74131430040603e-06, + "loss": 0.1866, + "num_tokens": 3909305930.0, + "step": 5125 + }, + { + "epoch": 6.99414293791941, + "grad_norm": 0.2233878564490833, + "learning_rate": 2.7393399769467443e-06, + "loss": 0.1864, + "num_tokens": 3910040889.0, + "step": 5126 + }, + { + "epoch": 6.99550851887809, + "grad_norm": 0.232492852899945, + "learning_rate": 2.7373681734570966e-06, + "loss": 0.1818, + "num_tokens": 3910802752.0, + "step": 5127 + }, + { + "epoch": 6.99687409983677, + "grad_norm": 0.21165456543594816, + "learning_rate": 2.735398890538592e-06, + "loss": 0.1887, + "num_tokens": 3911606255.0, + "step": 5128 + }, + { + "epoch": 6.998239680795451, + "grad_norm": 0.22581012921291183, + "learning_rate": 2.733432128791978e-06, + "loss": 0.1802, + "num_tokens": 3912293608.0, + "step": 5129 + }, + { + "epoch": 6.999605261754131, + "grad_norm": 0.222777718904541, + "learning_rate": 2.7314678888172263e-06, + "loss": 0.1822, + "num_tokens": 3913065563.0, + "step": 5130 + }, + { + "epoch": 7.0, + "grad_norm": 0.42565635941891367, + "learning_rate": 2.7295061712135433e-06, + "loss": 0.1696, + "num_tokens": 3913294952.0, + "step": 5131 + }, + { + "epoch": 7.00136558095868, + "grad_norm": 0.33466972415482615, + "learning_rate": 2.7275469765793615e-06, + "loss": 0.1647, + "num_tokens": 3914074205.0, + "step": 5132 + }, + { + "epoch": 7.002731161917361, + "grad_norm": 0.3300775348088642, + "learning_rate": 2.7255903055123475e-06, + "loss": 0.1666, + "num_tokens": 3914793807.0, + "step": 5133 + }, + { + "epoch": 7.004096742876041, + "grad_norm": 0.28756238696483577, + "learning_rate": 2.7236361586093997e-06, + "loss": 0.1684, + "num_tokens": 3915502865.0, + "step": 5134 + }, + { + "epoch": 7.005462323834722, + "grad_norm": 0.24159740173362276, + "learning_rate": 2.7216845364666367e-06, + "loss": 0.1674, + "num_tokens": 3916305587.0, + "step": 5135 + }, + { + "epoch": 7.006827904793402, + "grad_norm": 0.23114485815741323, + "learning_rate": 2.719735439679421e-06, + "loss": 0.1631, + "num_tokens": 3917091738.0, + "step": 5136 + }, + { + "epoch": 7.008193485752083, + "grad_norm": 0.2517179578706104, + "learning_rate": 2.717788868842334e-06, + "loss": 0.1588, + "num_tokens": 3917824523.0, + "step": 5137 + }, + { + "epoch": 7.0095590667107635, + "grad_norm": 0.3038822668661947, + "learning_rate": 2.715844824549191e-06, + "loss": 0.1593, + "num_tokens": 3918580998.0, + "step": 5138 + }, + { + "epoch": 7.0109246476694445, + "grad_norm": 0.2744013130254852, + "learning_rate": 2.7139033073930375e-06, + "loss": 0.1597, + "num_tokens": 3919376389.0, + "step": 5139 + }, + { + "epoch": 7.012290228628125, + "grad_norm": 0.2665617631309145, + "learning_rate": 2.7119643179661432e-06, + "loss": 0.1647, + "num_tokens": 3920154551.0, + "step": 5140 + }, + { + "epoch": 7.013655809586805, + "grad_norm": 0.2681011073450579, + "learning_rate": 2.71002785686001e-06, + "loss": 0.1672, + "num_tokens": 3920890306.0, + "step": 5141 + }, + { + "epoch": 7.015021390545486, + "grad_norm": 0.25752076328335544, + "learning_rate": 2.70809392466537e-06, + "loss": 0.1636, + "num_tokens": 3921627925.0, + "step": 5142 + }, + { + "epoch": 7.016386971504166, + "grad_norm": 0.26056749835607534, + "learning_rate": 2.7061625219721845e-06, + "loss": 0.1639, + "num_tokens": 3922396555.0, + "step": 5143 + }, + { + "epoch": 7.017752552462847, + "grad_norm": 0.2609612112444019, + "learning_rate": 2.704233649369634e-06, + "loss": 0.1621, + "num_tokens": 3923089937.0, + "step": 5144 + }, + { + "epoch": 7.019118133421527, + "grad_norm": 0.2408503095942782, + "learning_rate": 2.7023073074461414e-06, + "loss": 0.1592, + "num_tokens": 3923873116.0, + "step": 5145 + }, + { + "epoch": 7.020483714380208, + "grad_norm": 0.2302042792915584, + "learning_rate": 2.7003834967893444e-06, + "loss": 0.1543, + "num_tokens": 3924601066.0, + "step": 5146 + }, + { + "epoch": 7.021849295338888, + "grad_norm": 0.23243747902932005, + "learning_rate": 2.6984622179861163e-06, + "loss": 0.1543, + "num_tokens": 3925279444.0, + "step": 5147 + }, + { + "epoch": 7.023214876297569, + "grad_norm": 0.237987625031753, + "learning_rate": 2.6965434716225597e-06, + "loss": 0.1521, + "num_tokens": 3925967991.0, + "step": 5148 + }, + { + "epoch": 7.024580457256249, + "grad_norm": 0.24567662864722933, + "learning_rate": 2.6946272582839953e-06, + "loss": 0.1754, + "num_tokens": 3926752285.0, + "step": 5149 + }, + { + "epoch": 7.025946038214929, + "grad_norm": 0.251826536142183, + "learning_rate": 2.6927135785549798e-06, + "loss": 0.1582, + "num_tokens": 3927524338.0, + "step": 5150 + }, + { + "epoch": 7.02731161917361, + "grad_norm": 0.23429222144023443, + "learning_rate": 2.690802433019293e-06, + "loss": 0.1699, + "num_tokens": 3928286419.0, + "step": 5151 + }, + { + "epoch": 7.02867720013229, + "grad_norm": 0.23202443241134343, + "learning_rate": 2.6888938222599447e-06, + "loss": 0.1603, + "num_tokens": 3929035407.0, + "step": 5152 + }, + { + "epoch": 7.030042781090971, + "grad_norm": 0.23188349587641513, + "learning_rate": 2.6869877468591647e-06, + "loss": 0.1536, + "num_tokens": 3929756090.0, + "step": 5153 + }, + { + "epoch": 7.0314083620496515, + "grad_norm": 0.22652472954190672, + "learning_rate": 2.6850842073984196e-06, + "loss": 0.1637, + "num_tokens": 3930505896.0, + "step": 5154 + }, + { + "epoch": 7.0327739430083325, + "grad_norm": 0.2253335940555467, + "learning_rate": 2.683183204458395e-06, + "loss": 0.1633, + "num_tokens": 3931276559.0, + "step": 5155 + }, + { + "epoch": 7.034139523967013, + "grad_norm": 0.229278951988845, + "learning_rate": 2.681284738619001e-06, + "loss": 0.1541, + "num_tokens": 3931979868.0, + "step": 5156 + }, + { + "epoch": 7.035505104925694, + "grad_norm": 0.22181494317720954, + "learning_rate": 2.6793888104593836e-06, + "loss": 0.1668, + "num_tokens": 3932781351.0, + "step": 5157 + }, + { + "epoch": 7.036870685884374, + "grad_norm": 0.22778508975804843, + "learning_rate": 2.677495420557904e-06, + "loss": 0.1577, + "num_tokens": 3933566298.0, + "step": 5158 + }, + { + "epoch": 7.038236266843054, + "grad_norm": 0.22143918963203504, + "learning_rate": 2.6756045694921535e-06, + "loss": 0.1603, + "num_tokens": 3934355893.0, + "step": 5159 + }, + { + "epoch": 7.039601847801735, + "grad_norm": 0.23947558008996012, + "learning_rate": 2.673716257838951e-06, + "loss": 0.1621, + "num_tokens": 3935154702.0, + "step": 5160 + }, + { + "epoch": 7.040967428760415, + "grad_norm": 0.2194913163946518, + "learning_rate": 2.6718304861743383e-06, + "loss": 0.1658, + "num_tokens": 3935899687.0, + "step": 5161 + }, + { + "epoch": 7.042333009719096, + "grad_norm": 0.21597011108233383, + "learning_rate": 2.6699472550735805e-06, + "loss": 0.1585, + "num_tokens": 3936679875.0, + "step": 5162 + }, + { + "epoch": 7.043698590677776, + "grad_norm": 0.2206469828122371, + "learning_rate": 2.6680665651111715e-06, + "loss": 0.155, + "num_tokens": 3937506676.0, + "step": 5163 + }, + { + "epoch": 7.045064171636457, + "grad_norm": 0.21895978992943885, + "learning_rate": 2.6661884168608293e-06, + "loss": 0.1684, + "num_tokens": 3938308883.0, + "step": 5164 + }, + { + "epoch": 7.046429752595137, + "grad_norm": 0.2248559524545943, + "learning_rate": 2.6643128108954914e-06, + "loss": 0.1607, + "num_tokens": 3939097617.0, + "step": 5165 + }, + { + "epoch": 7.047795333553818, + "grad_norm": 0.2109010106552991, + "learning_rate": 2.662439747787331e-06, + "loss": 0.1621, + "num_tokens": 3939971176.0, + "step": 5166 + }, + { + "epoch": 7.049160914512498, + "grad_norm": 0.23948638223562596, + "learning_rate": 2.660569228107731e-06, + "loss": 0.1604, + "num_tokens": 3940689815.0, + "step": 5167 + }, + { + "epoch": 7.050526495471178, + "grad_norm": 0.2245749301533297, + "learning_rate": 2.658701252427311e-06, + "loss": 0.1556, + "num_tokens": 3941426599.0, + "step": 5168 + }, + { + "epoch": 7.051892076429859, + "grad_norm": 0.218855980720438, + "learning_rate": 2.6568358213159085e-06, + "loss": 0.1634, + "num_tokens": 3942240856.0, + "step": 5169 + }, + { + "epoch": 7.0532576573885395, + "grad_norm": 0.22027254735141802, + "learning_rate": 2.6549729353425856e-06, + "loss": 0.1593, + "num_tokens": 3942975016.0, + "step": 5170 + }, + { + "epoch": 7.0546232383472205, + "grad_norm": 0.23659433847062597, + "learning_rate": 2.6531125950756258e-06, + "loss": 0.1575, + "num_tokens": 3943661683.0, + "step": 5171 + }, + { + "epoch": 7.055988819305901, + "grad_norm": 0.202722343273409, + "learning_rate": 2.6512548010825406e-06, + "loss": 0.1527, + "num_tokens": 3944516478.0, + "step": 5172 + }, + { + "epoch": 7.057354400264582, + "grad_norm": 0.2110697820299748, + "learning_rate": 2.649399553930064e-06, + "loss": 0.1622, + "num_tokens": 3945337931.0, + "step": 5173 + }, + { + "epoch": 7.058719981223262, + "grad_norm": 0.2267294770874837, + "learning_rate": 2.647546854184147e-06, + "loss": 0.1596, + "num_tokens": 3946072265.0, + "step": 5174 + }, + { + "epoch": 7.060085562181943, + "grad_norm": 0.21976384560743312, + "learning_rate": 2.6456967024099737e-06, + "loss": 0.1549, + "num_tokens": 3946816635.0, + "step": 5175 + }, + { + "epoch": 7.061451143140623, + "grad_norm": 0.22581418527163802, + "learning_rate": 2.643849099171941e-06, + "loss": 0.1638, + "num_tokens": 3947525486.0, + "step": 5176 + }, + { + "epoch": 7.062816724099303, + "grad_norm": 0.2239818094998691, + "learning_rate": 2.642004045033674e-06, + "loss": 0.1546, + "num_tokens": 3948230953.0, + "step": 5177 + }, + { + "epoch": 7.064182305057984, + "grad_norm": 0.2263616912241683, + "learning_rate": 2.6401615405580193e-06, + "loss": 0.1713, + "num_tokens": 3949094737.0, + "step": 5178 + }, + { + "epoch": 7.065547886016664, + "grad_norm": 0.2261815712619795, + "learning_rate": 2.6383215863070443e-06, + "loss": 0.159, + "num_tokens": 3949846816.0, + "step": 5179 + }, + { + "epoch": 7.066913466975345, + "grad_norm": 0.21566833862480214, + "learning_rate": 2.636484182842039e-06, + "loss": 0.1587, + "num_tokens": 3950649749.0, + "step": 5180 + }, + { + "epoch": 7.068279047934025, + "grad_norm": 0.21997235710927188, + "learning_rate": 2.634649330723517e-06, + "loss": 0.1652, + "num_tokens": 3951497077.0, + "step": 5181 + }, + { + "epoch": 7.069644628892706, + "grad_norm": 0.22798623738741908, + "learning_rate": 2.6328170305112134e-06, + "loss": 0.1679, + "num_tokens": 3952241781.0, + "step": 5182 + }, + { + "epoch": 7.071010209851386, + "grad_norm": 0.2349440985953012, + "learning_rate": 2.630987282764078e-06, + "loss": 0.1511, + "num_tokens": 3952888798.0, + "step": 5183 + }, + { + "epoch": 7.072375790810067, + "grad_norm": 0.21297618351054204, + "learning_rate": 2.6291600880402957e-06, + "loss": 0.1658, + "num_tokens": 3953662463.0, + "step": 5184 + }, + { + "epoch": 7.073741371768747, + "grad_norm": 0.24043425374124985, + "learning_rate": 2.6273354468972576e-06, + "loss": 0.1589, + "num_tokens": 3954346863.0, + "step": 5185 + }, + { + "epoch": 7.0751069527274275, + "grad_norm": 0.2205573084623806, + "learning_rate": 2.6255133598915864e-06, + "loss": 0.1633, + "num_tokens": 3955074556.0, + "step": 5186 + }, + { + "epoch": 7.0764725336861085, + "grad_norm": 0.23463607360695565, + "learning_rate": 2.6236938275791212e-06, + "loss": 0.1606, + "num_tokens": 3955813277.0, + "step": 5187 + }, + { + "epoch": 7.077838114644789, + "grad_norm": 0.2164812569168557, + "learning_rate": 2.621876850514922e-06, + "loss": 0.1619, + "num_tokens": 3956612099.0, + "step": 5188 + }, + { + "epoch": 7.07920369560347, + "grad_norm": 0.2304694743916733, + "learning_rate": 2.6200624292532693e-06, + "loss": 0.1573, + "num_tokens": 3957356102.0, + "step": 5189 + }, + { + "epoch": 7.08056927656215, + "grad_norm": 0.210083015543732, + "learning_rate": 2.6182505643476653e-06, + "loss": 0.1547, + "num_tokens": 3958132376.0, + "step": 5190 + }, + { + "epoch": 7.081934857520831, + "grad_norm": 0.21265406866850053, + "learning_rate": 2.616441256350831e-06, + "loss": 0.1635, + "num_tokens": 3959046671.0, + "step": 5191 + }, + { + "epoch": 7.083300438479511, + "grad_norm": 0.22639313136582226, + "learning_rate": 2.6146345058147094e-06, + "loss": 0.1613, + "num_tokens": 3959795343.0, + "step": 5192 + }, + { + "epoch": 7.084666019438192, + "grad_norm": 0.2173381626551724, + "learning_rate": 2.612830313290462e-06, + "loss": 0.1649, + "num_tokens": 3960626269.0, + "step": 5193 + }, + { + "epoch": 7.086031600396872, + "grad_norm": 0.2251812967458691, + "learning_rate": 2.6110286793284663e-06, + "loss": 0.1586, + "num_tokens": 3961374886.0, + "step": 5194 + }, + { + "epoch": 7.087397181355552, + "grad_norm": 0.22030722938437478, + "learning_rate": 2.609229604478326e-06, + "loss": 0.1576, + "num_tokens": 3962085468.0, + "step": 5195 + }, + { + "epoch": 7.088762762314233, + "grad_norm": 0.22405748329246722, + "learning_rate": 2.6074330892888606e-06, + "loss": 0.1726, + "num_tokens": 3962855172.0, + "step": 5196 + }, + { + "epoch": 7.090128343272913, + "grad_norm": 0.22958829850118415, + "learning_rate": 2.6056391343081078e-06, + "loss": 0.1623, + "num_tokens": 3963592741.0, + "step": 5197 + }, + { + "epoch": 7.091493924231594, + "grad_norm": 0.23287554962957313, + "learning_rate": 2.6038477400833255e-06, + "loss": 0.1611, + "num_tokens": 3964312617.0, + "step": 5198 + }, + { + "epoch": 7.092859505190274, + "grad_norm": 0.2194342354513781, + "learning_rate": 2.6020589071609923e-06, + "loss": 0.1655, + "num_tokens": 3965119639.0, + "step": 5199 + }, + { + "epoch": 7.094225086148955, + "grad_norm": 0.22238544543804212, + "learning_rate": 2.600272636086802e-06, + "loss": 0.1583, + "num_tokens": 3965902789.0, + "step": 5200 + }, + { + "epoch": 7.095590667107635, + "grad_norm": 0.2318905145919968, + "learning_rate": 2.5984889274056707e-06, + "loss": 0.1627, + "num_tokens": 3966652780.0, + "step": 5201 + }, + { + "epoch": 7.096956248066316, + "grad_norm": 0.22516352471965273, + "learning_rate": 2.5967077816617274e-06, + "loss": 0.1623, + "num_tokens": 3967399168.0, + "step": 5202 + }, + { + "epoch": 7.0983218290249965, + "grad_norm": 0.23554199592559583, + "learning_rate": 2.594929199398324e-06, + "loss": 0.1635, + "num_tokens": 3968127319.0, + "step": 5203 + }, + { + "epoch": 7.099687409983677, + "grad_norm": 0.23809458286736648, + "learning_rate": 2.59315318115803e-06, + "loss": 0.1645, + "num_tokens": 3968875999.0, + "step": 5204 + }, + { + "epoch": 7.101052990942358, + "grad_norm": 0.22225232994996177, + "learning_rate": 2.591379727482632e-06, + "loss": 0.1621, + "num_tokens": 3969653185.0, + "step": 5205 + }, + { + "epoch": 7.102418571901038, + "grad_norm": 0.23196173815933868, + "learning_rate": 2.5896088389131314e-06, + "loss": 0.1502, + "num_tokens": 3970346946.0, + "step": 5206 + }, + { + "epoch": 7.103784152859719, + "grad_norm": 0.22221085435665333, + "learning_rate": 2.58784051598975e-06, + "loss": 0.1577, + "num_tokens": 3971090741.0, + "step": 5207 + }, + { + "epoch": 7.105149733818399, + "grad_norm": 0.2248256876182048, + "learning_rate": 2.5860747592519293e-06, + "loss": 0.1595, + "num_tokens": 3971791525.0, + "step": 5208 + }, + { + "epoch": 7.10651531477708, + "grad_norm": 0.22753378080095796, + "learning_rate": 2.5843115692383224e-06, + "loss": 0.1654, + "num_tokens": 3972587498.0, + "step": 5209 + }, + { + "epoch": 7.10788089573576, + "grad_norm": 0.2278142852027865, + "learning_rate": 2.582550946486805e-06, + "loss": 0.1681, + "num_tokens": 3973395546.0, + "step": 5210 + }, + { + "epoch": 7.109246476694441, + "grad_norm": 0.2250530064224165, + "learning_rate": 2.580792891534463e-06, + "loss": 0.1587, + "num_tokens": 3974158046.0, + "step": 5211 + }, + { + "epoch": 7.110612057653121, + "grad_norm": 0.21968387253246166, + "learning_rate": 2.579037404917606e-06, + "loss": 0.1622, + "num_tokens": 3974918711.0, + "step": 5212 + }, + { + "epoch": 7.111977638611801, + "grad_norm": 0.23101531042835302, + "learning_rate": 2.5772844871717546e-06, + "loss": 0.1582, + "num_tokens": 3975619096.0, + "step": 5213 + }, + { + "epoch": 7.113343219570482, + "grad_norm": 0.20624916335555435, + "learning_rate": 2.5755341388316514e-06, + "loss": 0.1586, + "num_tokens": 3976411812.0, + "step": 5214 + }, + { + "epoch": 7.114708800529162, + "grad_norm": 0.23062449719568262, + "learning_rate": 2.573786360431247e-06, + "loss": 0.1614, + "num_tokens": 3977190015.0, + "step": 5215 + }, + { + "epoch": 7.116074381487843, + "grad_norm": 0.21406332803300715, + "learning_rate": 2.572041152503718e-06, + "loss": 0.159, + "num_tokens": 3978011404.0, + "step": 5216 + }, + { + "epoch": 7.117439962446523, + "grad_norm": 0.24138511775439844, + "learning_rate": 2.5702985155814504e-06, + "loss": 0.1655, + "num_tokens": 3978749431.0, + "step": 5217 + }, + { + "epoch": 7.118805543405204, + "grad_norm": 0.24262292457555437, + "learning_rate": 2.5685584501960438e-06, + "loss": 0.1588, + "num_tokens": 3979440228.0, + "step": 5218 + }, + { + "epoch": 7.1201711243638846, + "grad_norm": 0.22766804576432892, + "learning_rate": 2.5668209568783214e-06, + "loss": 0.1585, + "num_tokens": 3980106663.0, + "step": 5219 + }, + { + "epoch": 7.1215367053225656, + "grad_norm": 0.22805898595558508, + "learning_rate": 2.5650860361583147e-06, + "loss": 0.1581, + "num_tokens": 3980890384.0, + "step": 5220 + }, + { + "epoch": 7.122902286281246, + "grad_norm": 0.238310283347779, + "learning_rate": 2.563353688565272e-06, + "loss": 0.1628, + "num_tokens": 3981587170.0, + "step": 5221 + }, + { + "epoch": 7.124267867239926, + "grad_norm": 0.23953254611320593, + "learning_rate": 2.5616239146276593e-06, + "loss": 0.1686, + "num_tokens": 3982355916.0, + "step": 5222 + }, + { + "epoch": 7.125633448198607, + "grad_norm": 0.2332702781336164, + "learning_rate": 2.5598967148731567e-06, + "loss": 0.1583, + "num_tokens": 3983142109.0, + "step": 5223 + }, + { + "epoch": 7.126999029157287, + "grad_norm": 0.2104882634467482, + "learning_rate": 2.558172089828654e-06, + "loss": 0.1614, + "num_tokens": 3983957122.0, + "step": 5224 + }, + { + "epoch": 7.128364610115968, + "grad_norm": 0.22478609093948457, + "learning_rate": 2.5564500400202645e-06, + "loss": 0.1536, + "num_tokens": 3984734339.0, + "step": 5225 + }, + { + "epoch": 7.129730191074648, + "grad_norm": 0.22089115860234507, + "learning_rate": 2.5547305659733083e-06, + "loss": 0.1565, + "num_tokens": 3985481119.0, + "step": 5226 + }, + { + "epoch": 7.131095772033329, + "grad_norm": 0.2533581682058892, + "learning_rate": 2.553013668212322e-06, + "loss": 0.166, + "num_tokens": 3986131475.0, + "step": 5227 + }, + { + "epoch": 7.132461352992009, + "grad_norm": 0.22128082885473202, + "learning_rate": 2.5512993472610593e-06, + "loss": 0.159, + "num_tokens": 3986911871.0, + "step": 5228 + }, + { + "epoch": 7.13382693395069, + "grad_norm": 0.2294642599903656, + "learning_rate": 2.5495876036424823e-06, + "loss": 0.1583, + "num_tokens": 3987664281.0, + "step": 5229 + }, + { + "epoch": 7.13519251490937, + "grad_norm": 0.2323530715266377, + "learning_rate": 2.5478784378787715e-06, + "loss": 0.1535, + "num_tokens": 3988414397.0, + "step": 5230 + }, + { + "epoch": 7.13655809586805, + "grad_norm": 0.21794813650051617, + "learning_rate": 2.5461718504913173e-06, + "loss": 0.1617, + "num_tokens": 3989206827.0, + "step": 5231 + }, + { + "epoch": 7.137923676826731, + "grad_norm": 0.22479258563496338, + "learning_rate": 2.5444678420007305e-06, + "loss": 0.161, + "num_tokens": 3989997720.0, + "step": 5232 + }, + { + "epoch": 7.1392892577854115, + "grad_norm": 0.23352209764530527, + "learning_rate": 2.5427664129268253e-06, + "loss": 0.1525, + "num_tokens": 3990729886.0, + "step": 5233 + }, + { + "epoch": 7.1406548387440925, + "grad_norm": 0.21981530163019, + "learning_rate": 2.5410675637886362e-06, + "loss": 0.1623, + "num_tokens": 3991510829.0, + "step": 5234 + }, + { + "epoch": 7.142020419702773, + "grad_norm": 0.22936975813264698, + "learning_rate": 2.539371295104409e-06, + "loss": 0.1533, + "num_tokens": 3992281046.0, + "step": 5235 + }, + { + "epoch": 7.143386000661454, + "grad_norm": 0.2681033702598507, + "learning_rate": 2.537677607391598e-06, + "loss": 0.1624, + "num_tokens": 3993014490.0, + "step": 5236 + }, + { + "epoch": 7.144751581620134, + "grad_norm": 0.22656889956042275, + "learning_rate": 2.5359865011668803e-06, + "loss": 0.1667, + "num_tokens": 3993826218.0, + "step": 5237 + }, + { + "epoch": 7.146117162578815, + "grad_norm": 0.2641051711380781, + "learning_rate": 2.5342979769461353e-06, + "loss": 0.1611, + "num_tokens": 3994579875.0, + "step": 5238 + }, + { + "epoch": 7.147482743537495, + "grad_norm": 0.2537911145543262, + "learning_rate": 2.5326120352444573e-06, + "loss": 0.1594, + "num_tokens": 3995277028.0, + "step": 5239 + }, + { + "epoch": 7.148848324496175, + "grad_norm": 0.2279765042022696, + "learning_rate": 2.5309286765761586e-06, + "loss": 0.164, + "num_tokens": 3996081399.0, + "step": 5240 + }, + { + "epoch": 7.150213905454856, + "grad_norm": 0.23600250734682454, + "learning_rate": 2.529247901454755e-06, + "loss": 0.1563, + "num_tokens": 3996773342.0, + "step": 5241 + }, + { + "epoch": 7.151579486413536, + "grad_norm": 0.22915115782623457, + "learning_rate": 2.527569710392979e-06, + "loss": 0.1598, + "num_tokens": 3997472013.0, + "step": 5242 + }, + { + "epoch": 7.152945067372217, + "grad_norm": 0.22307889358969749, + "learning_rate": 2.525894103902775e-06, + "loss": 0.1598, + "num_tokens": 3998229257.0, + "step": 5243 + }, + { + "epoch": 7.154310648330897, + "grad_norm": 0.22316350163590268, + "learning_rate": 2.524221082495298e-06, + "loss": 0.1625, + "num_tokens": 3999018092.0, + "step": 5244 + }, + { + "epoch": 7.155676229289578, + "grad_norm": 0.22618392031497345, + "learning_rate": 2.5225506466809125e-06, + "loss": 0.1629, + "num_tokens": 3999806287.0, + "step": 5245 + }, + { + "epoch": 7.157041810248258, + "grad_norm": 0.216465123784099, + "learning_rate": 2.5208827969691994e-06, + "loss": 0.1634, + "num_tokens": 4000646441.0, + "step": 5246 + }, + { + "epoch": 7.158407391206939, + "grad_norm": 0.23222283325415982, + "learning_rate": 2.519217533868944e-06, + "loss": 0.1648, + "num_tokens": 4001419788.0, + "step": 5247 + }, + { + "epoch": 7.159772972165619, + "grad_norm": 0.2260348493081412, + "learning_rate": 2.5175548578881482e-06, + "loss": 0.165, + "num_tokens": 4002187722.0, + "step": 5248 + }, + { + "epoch": 7.1611385531242995, + "grad_norm": 0.22481753552392533, + "learning_rate": 2.5158947695340237e-06, + "loss": 0.1576, + "num_tokens": 4002967758.0, + "step": 5249 + }, + { + "epoch": 7.1625041340829805, + "grad_norm": 0.24320126832934552, + "learning_rate": 2.5142372693129876e-06, + "loss": 0.1681, + "num_tokens": 4003709428.0, + "step": 5250 + }, + { + "epoch": 7.163869715041661, + "grad_norm": 0.2273117610380449, + "learning_rate": 2.5125823577306736e-06, + "loss": 0.1621, + "num_tokens": 4004456712.0, + "step": 5251 + }, + { + "epoch": 7.165235296000342, + "grad_norm": 0.2192133133790806, + "learning_rate": 2.5109300352919237e-06, + "loss": 0.158, + "num_tokens": 4005232657.0, + "step": 5252 + }, + { + "epoch": 7.166600876959022, + "grad_norm": 0.21636681745064829, + "learning_rate": 2.5092803025007906e-06, + "loss": 0.1585, + "num_tokens": 4005948120.0, + "step": 5253 + }, + { + "epoch": 7.167966457917703, + "grad_norm": 0.21403086624264714, + "learning_rate": 2.507633159860533e-06, + "loss": 0.1589, + "num_tokens": 4006749220.0, + "step": 5254 + }, + { + "epoch": 7.169332038876383, + "grad_norm": 0.22371140608785042, + "learning_rate": 2.5059886078736278e-06, + "loss": 0.163, + "num_tokens": 4007549096.0, + "step": 5255 + }, + { + "epoch": 7.170697619835064, + "grad_norm": 0.21789759591846056, + "learning_rate": 2.504346647041752e-06, + "loss": 0.1612, + "num_tokens": 4008362989.0, + "step": 5256 + }, + { + "epoch": 7.172063200793744, + "grad_norm": 0.2317596232219424, + "learning_rate": 2.502707277865799e-06, + "loss": 0.168, + "num_tokens": 4009074208.0, + "step": 5257 + }, + { + "epoch": 7.173428781752424, + "grad_norm": 0.2238834990276597, + "learning_rate": 2.5010705008458715e-06, + "loss": 0.165, + "num_tokens": 4009863877.0, + "step": 5258 + }, + { + "epoch": 7.174794362711105, + "grad_norm": 0.21399006926804387, + "learning_rate": 2.499436316481275e-06, + "loss": 0.1617, + "num_tokens": 4010680283.0, + "step": 5259 + }, + { + "epoch": 7.176159943669785, + "grad_norm": 0.2324779109832412, + "learning_rate": 2.497804725270529e-06, + "loss": 0.1669, + "num_tokens": 4011489335.0, + "step": 5260 + }, + { + "epoch": 7.177525524628466, + "grad_norm": 0.2184619909264437, + "learning_rate": 2.4961757277113633e-06, + "loss": 0.1694, + "num_tokens": 4012313455.0, + "step": 5261 + }, + { + "epoch": 7.178891105587146, + "grad_norm": 0.22027062118564952, + "learning_rate": 2.494549324300715e-06, + "loss": 0.1599, + "num_tokens": 4013077802.0, + "step": 5262 + }, + { + "epoch": 7.180256686545827, + "grad_norm": 0.22864368584077188, + "learning_rate": 2.4929255155347258e-06, + "loss": 0.1653, + "num_tokens": 4013828045.0, + "step": 5263 + }, + { + "epoch": 7.181622267504507, + "grad_norm": 0.3161269824476037, + "learning_rate": 2.4913043019087546e-06, + "loss": 0.1527, + "num_tokens": 4014546381.0, + "step": 5264 + }, + { + "epoch": 7.182987848463188, + "grad_norm": 0.23327569593137984, + "learning_rate": 2.48968568391736e-06, + "loss": 0.1625, + "num_tokens": 4015265409.0, + "step": 5265 + }, + { + "epoch": 7.1843534294218685, + "grad_norm": 0.2292400815405088, + "learning_rate": 2.488069662054311e-06, + "loss": 0.1571, + "num_tokens": 4016022785.0, + "step": 5266 + }, + { + "epoch": 7.185719010380549, + "grad_norm": 0.2265083987802442, + "learning_rate": 2.48645623681259e-06, + "loss": 0.1688, + "num_tokens": 4016812196.0, + "step": 5267 + }, + { + "epoch": 7.18708459133923, + "grad_norm": 0.24716669188490173, + "learning_rate": 2.4848454086843796e-06, + "loss": 0.1671, + "num_tokens": 4017542592.0, + "step": 5268 + }, + { + "epoch": 7.18845017229791, + "grad_norm": 0.24597092514254212, + "learning_rate": 2.483237178161075e-06, + "loss": 0.1634, + "num_tokens": 4018279113.0, + "step": 5269 + }, + { + "epoch": 7.189815753256591, + "grad_norm": 0.22723541125809757, + "learning_rate": 2.4816315457332773e-06, + "loss": 0.1605, + "num_tokens": 4019025885.0, + "step": 5270 + }, + { + "epoch": 7.191181334215271, + "grad_norm": 0.22242185057524128, + "learning_rate": 2.4800285118907976e-06, + "loss": 0.1588, + "num_tokens": 4019803771.0, + "step": 5271 + }, + { + "epoch": 7.192546915173952, + "grad_norm": 0.24690624153151042, + "learning_rate": 2.4784280771226486e-06, + "loss": 0.1656, + "num_tokens": 4020500025.0, + "step": 5272 + }, + { + "epoch": 7.193912496132632, + "grad_norm": 0.22719994551371867, + "learning_rate": 2.476830241917056e-06, + "loss": 0.1608, + "num_tokens": 4021267421.0, + "step": 5273 + }, + { + "epoch": 7.195278077091313, + "grad_norm": 0.2371264535056204, + "learning_rate": 2.4752350067614485e-06, + "loss": 0.1674, + "num_tokens": 4022055079.0, + "step": 5274 + }, + { + "epoch": 7.196643658049993, + "grad_norm": 0.22752387890536868, + "learning_rate": 2.4736423721424653e-06, + "loss": 0.1638, + "num_tokens": 4022786569.0, + "step": 5275 + }, + { + "epoch": 7.198009239008673, + "grad_norm": 0.23308728200666176, + "learning_rate": 2.47205233854595e-06, + "loss": 0.1721, + "num_tokens": 4023563415.0, + "step": 5276 + }, + { + "epoch": 7.199374819967354, + "grad_norm": 0.23296033400524366, + "learning_rate": 2.4704649064569504e-06, + "loss": 0.1703, + "num_tokens": 4024330662.0, + "step": 5277 + }, + { + "epoch": 7.200740400926034, + "grad_norm": 0.2368478284650606, + "learning_rate": 2.468880076359726e-06, + "loss": 0.1585, + "num_tokens": 4025054989.0, + "step": 5278 + }, + { + "epoch": 7.202105981884715, + "grad_norm": 0.2276077872446591, + "learning_rate": 2.467297848737739e-06, + "loss": 0.1614, + "num_tokens": 4025859833.0, + "step": 5279 + }, + { + "epoch": 7.203471562843395, + "grad_norm": 0.22553703367971492, + "learning_rate": 2.4657182240736576e-06, + "loss": 0.1559, + "num_tokens": 4026618808.0, + "step": 5280 + }, + { + "epoch": 7.204837143802076, + "grad_norm": 0.36551394694679606, + "learning_rate": 2.4641412028493593e-06, + "loss": 0.1591, + "num_tokens": 4027365038.0, + "step": 5281 + }, + { + "epoch": 7.2062027247607565, + "grad_norm": 0.22689274444197852, + "learning_rate": 2.4625667855459225e-06, + "loss": 0.1639, + "num_tokens": 4028098243.0, + "step": 5282 + }, + { + "epoch": 7.2075683057194375, + "grad_norm": 0.22890450017408542, + "learning_rate": 2.4609949726436347e-06, + "loss": 0.1663, + "num_tokens": 4028896408.0, + "step": 5283 + }, + { + "epoch": 7.208933886678118, + "grad_norm": 0.2215799617041883, + "learning_rate": 2.4594257646219873e-06, + "loss": 0.1637, + "num_tokens": 4029683429.0, + "step": 5284 + }, + { + "epoch": 7.210299467636798, + "grad_norm": 0.2390365603063388, + "learning_rate": 2.457859161959679e-06, + "loss": 0.1658, + "num_tokens": 4030399212.0, + "step": 5285 + }, + { + "epoch": 7.211665048595479, + "grad_norm": 0.22437021126708473, + "learning_rate": 2.456295165134612e-06, + "loss": 0.1634, + "num_tokens": 4031200427.0, + "step": 5286 + }, + { + "epoch": 7.213030629554159, + "grad_norm": 0.2209462206004101, + "learning_rate": 2.454733774623893e-06, + "loss": 0.1622, + "num_tokens": 4031969134.0, + "step": 5287 + }, + { + "epoch": 7.21439621051284, + "grad_norm": 0.22814355205257997, + "learning_rate": 2.4531749909038364e-06, + "loss": 0.1537, + "num_tokens": 4032688357.0, + "step": 5288 + }, + { + "epoch": 7.21576179147152, + "grad_norm": 0.2631458618420531, + "learning_rate": 2.451618814449956e-06, + "loss": 0.1593, + "num_tokens": 4033427241.0, + "step": 5289 + }, + { + "epoch": 7.217127372430201, + "grad_norm": 0.2615774986671202, + "learning_rate": 2.450065245736979e-06, + "loss": 0.1632, + "num_tokens": 4034162179.0, + "step": 5290 + }, + { + "epoch": 7.218492953388881, + "grad_norm": 0.23708831829956115, + "learning_rate": 2.4485142852388277e-06, + "loss": 0.1618, + "num_tokens": 4034930423.0, + "step": 5291 + }, + { + "epoch": 7.219858534347562, + "grad_norm": 0.22705822431658143, + "learning_rate": 2.446965933428635e-06, + "loss": 0.1637, + "num_tokens": 4035742098.0, + "step": 5292 + }, + { + "epoch": 7.221224115306242, + "grad_norm": 0.22801416477062525, + "learning_rate": 2.4454201907787364e-06, + "loss": 0.163, + "num_tokens": 4036466032.0, + "step": 5293 + }, + { + "epoch": 7.222589696264922, + "grad_norm": 0.2242775449492381, + "learning_rate": 2.4438770577606704e-06, + "loss": 0.1674, + "num_tokens": 4037337559.0, + "step": 5294 + }, + { + "epoch": 7.223955277223603, + "grad_norm": 0.22717084360874654, + "learning_rate": 2.4423365348451793e-06, + "loss": 0.1573, + "num_tokens": 4038085019.0, + "step": 5295 + }, + { + "epoch": 7.225320858182283, + "grad_norm": 0.23149831208451657, + "learning_rate": 2.4407986225022108e-06, + "loss": 0.1652, + "num_tokens": 4038864870.0, + "step": 5296 + }, + { + "epoch": 7.226686439140964, + "grad_norm": 0.22447481271388975, + "learning_rate": 2.4392633212009144e-06, + "loss": 0.1586, + "num_tokens": 4039598186.0, + "step": 5297 + }, + { + "epoch": 7.2280520200996445, + "grad_norm": 0.21824743188015736, + "learning_rate": 2.437730631409644e-06, + "loss": 0.1594, + "num_tokens": 4040391786.0, + "step": 5298 + }, + { + "epoch": 7.2294176010583255, + "grad_norm": 0.23087681602589674, + "learning_rate": 2.4362005535959604e-06, + "loss": 0.1608, + "num_tokens": 4041200871.0, + "step": 5299 + }, + { + "epoch": 7.230783182017006, + "grad_norm": 0.2223645680214769, + "learning_rate": 2.4346730882266185e-06, + "loss": 0.1655, + "num_tokens": 4041987374.0, + "step": 5300 + }, + { + "epoch": 7.232148762975687, + "grad_norm": 0.22493951499596498, + "learning_rate": 2.433148235767586e-06, + "loss": 0.161, + "num_tokens": 4042730587.0, + "step": 5301 + }, + { + "epoch": 7.233514343934367, + "grad_norm": 0.2216519798201471, + "learning_rate": 2.4316259966840265e-06, + "loss": 0.1614, + "num_tokens": 4043475368.0, + "step": 5302 + }, + { + "epoch": 7.234879924893047, + "grad_norm": 0.2211717583966399, + "learning_rate": 2.4301063714403137e-06, + "loss": 0.1632, + "num_tokens": 4044282821.0, + "step": 5303 + }, + { + "epoch": 7.236245505851728, + "grad_norm": 0.22472182602398846, + "learning_rate": 2.428589360500013e-06, + "loss": 0.1688, + "num_tokens": 4045066716.0, + "step": 5304 + }, + { + "epoch": 7.237611086810408, + "grad_norm": 0.22729689636464273, + "learning_rate": 2.4270749643259047e-06, + "loss": 0.1542, + "num_tokens": 4045781706.0, + "step": 5305 + }, + { + "epoch": 7.238976667769089, + "grad_norm": 0.2241855072098184, + "learning_rate": 2.425563183379962e-06, + "loss": 0.1595, + "num_tokens": 4046590802.0, + "step": 5306 + }, + { + "epoch": 7.240342248727769, + "grad_norm": 0.2211750585622038, + "learning_rate": 2.4240540181233636e-06, + "loss": 0.1602, + "num_tokens": 4047347391.0, + "step": 5307 + }, + { + "epoch": 7.24170782968645, + "grad_norm": 0.22621664787651147, + "learning_rate": 2.422547469016494e-06, + "loss": 0.163, + "num_tokens": 4048077602.0, + "step": 5308 + }, + { + "epoch": 7.24307341064513, + "grad_norm": 0.23131585233650626, + "learning_rate": 2.4210435365189315e-06, + "loss": 0.16, + "num_tokens": 4048817737.0, + "step": 5309 + }, + { + "epoch": 7.244438991603811, + "grad_norm": 0.2310503677601857, + "learning_rate": 2.419542221089464e-06, + "loss": 0.1565, + "num_tokens": 4049550182.0, + "step": 5310 + }, + { + "epoch": 7.245804572562491, + "grad_norm": 0.21888698829919545, + "learning_rate": 2.418043523186078e-06, + "loss": 0.1575, + "num_tokens": 4050365159.0, + "step": 5311 + }, + { + "epoch": 7.247170153521171, + "grad_norm": 0.24971539304209103, + "learning_rate": 2.416547443265959e-06, + "loss": 0.1693, + "num_tokens": 4051172523.0, + "step": 5312 + }, + { + "epoch": 7.248535734479852, + "grad_norm": 0.2330722141250014, + "learning_rate": 2.4150539817854966e-06, + "loss": 0.1571, + "num_tokens": 4051956919.0, + "step": 5313 + }, + { + "epoch": 7.249901315438533, + "grad_norm": 0.21873435772319708, + "learning_rate": 2.4135631392002825e-06, + "loss": 0.1626, + "num_tokens": 4052825881.0, + "step": 5314 + }, + { + "epoch": 7.251266896397214, + "grad_norm": 0.22880063151490565, + "learning_rate": 2.412074915965108e-06, + "loss": 0.155, + "num_tokens": 4053491715.0, + "step": 5315 + }, + { + "epoch": 7.252632477355894, + "grad_norm": 0.2267466958426787, + "learning_rate": 2.410589312533964e-06, + "loss": 0.1697, + "num_tokens": 4054309973.0, + "step": 5316 + }, + { + "epoch": 7.253998058314575, + "grad_norm": 0.23871625729860377, + "learning_rate": 2.4091063293600465e-06, + "loss": 0.1574, + "num_tokens": 4055068646.0, + "step": 5317 + }, + { + "epoch": 7.255363639273255, + "grad_norm": 0.22096356224605743, + "learning_rate": 2.4076259668957475e-06, + "loss": 0.161, + "num_tokens": 4055864372.0, + "step": 5318 + }, + { + "epoch": 7.256729220231936, + "grad_norm": 0.22585800747202195, + "learning_rate": 2.4061482255926622e-06, + "loss": 0.1689, + "num_tokens": 4056684677.0, + "step": 5319 + }, + { + "epoch": 7.258094801190616, + "grad_norm": 0.23372806976057542, + "learning_rate": 2.4046731059015855e-06, + "loss": 0.1613, + "num_tokens": 4057429568.0, + "step": 5320 + }, + { + "epoch": 7.259460382149296, + "grad_norm": 0.217833926122154, + "learning_rate": 2.403200608272511e-06, + "loss": 0.1642, + "num_tokens": 4058243318.0, + "step": 5321 + }, + { + "epoch": 7.260825963107977, + "grad_norm": 0.22676100644288907, + "learning_rate": 2.4017307331546354e-06, + "loss": 0.1534, + "num_tokens": 4058996157.0, + "step": 5322 + }, + { + "epoch": 7.262191544066657, + "grad_norm": 0.229467206158808, + "learning_rate": 2.4002634809963536e-06, + "loss": 0.1605, + "num_tokens": 4059741185.0, + "step": 5323 + }, + { + "epoch": 7.263557125025338, + "grad_norm": 0.2289795731046091, + "learning_rate": 2.3987988522452625e-06, + "loss": 0.1619, + "num_tokens": 4060489501.0, + "step": 5324 + }, + { + "epoch": 7.264922705984018, + "grad_norm": 0.23930215142569702, + "learning_rate": 2.397336847348153e-06, + "loss": 0.1559, + "num_tokens": 4061238913.0, + "step": 5325 + }, + { + "epoch": 7.266288286942699, + "grad_norm": 0.22681187569688466, + "learning_rate": 2.395877466751023e-06, + "loss": 0.1608, + "num_tokens": 4062009300.0, + "step": 5326 + }, + { + "epoch": 7.267653867901379, + "grad_norm": 0.22637379630506999, + "learning_rate": 2.3944207108990656e-06, + "loss": 0.1663, + "num_tokens": 4062835571.0, + "step": 5327 + }, + { + "epoch": 7.26901944886006, + "grad_norm": 0.2237783607689323, + "learning_rate": 2.3929665802366713e-06, + "loss": 0.1638, + "num_tokens": 4063601960.0, + "step": 5328 + }, + { + "epoch": 7.2703850298187405, + "grad_norm": 0.23168507594702362, + "learning_rate": 2.3915150752074363e-06, + "loss": 0.1589, + "num_tokens": 4064380083.0, + "step": 5329 + }, + { + "epoch": 7.271750610777421, + "grad_norm": 0.2282930200963839, + "learning_rate": 2.3900661962541493e-06, + "loss": 0.1654, + "num_tokens": 4065134852.0, + "step": 5330 + }, + { + "epoch": 7.273116191736102, + "grad_norm": 0.22996725362412523, + "learning_rate": 2.3886199438188017e-06, + "loss": 0.1645, + "num_tokens": 4065922443.0, + "step": 5331 + }, + { + "epoch": 7.274481772694782, + "grad_norm": 0.2344034293655992, + "learning_rate": 2.387176318342581e-06, + "loss": 0.1519, + "num_tokens": 4066625748.0, + "step": 5332 + }, + { + "epoch": 7.275847353653463, + "grad_norm": 0.22724219842764998, + "learning_rate": 2.3857353202658782e-06, + "loss": 0.1595, + "num_tokens": 4067402938.0, + "step": 5333 + }, + { + "epoch": 7.277212934612143, + "grad_norm": 0.22901877165278292, + "learning_rate": 2.3842969500282747e-06, + "loss": 0.1617, + "num_tokens": 4068197958.0, + "step": 5334 + }, + { + "epoch": 7.278578515570824, + "grad_norm": 0.2360259926993058, + "learning_rate": 2.3828612080685576e-06, + "loss": 0.1667, + "num_tokens": 4068973786.0, + "step": 5335 + }, + { + "epoch": 7.279944096529504, + "grad_norm": 0.215950405257391, + "learning_rate": 2.3814280948247105e-06, + "loss": 0.1644, + "num_tokens": 4069788410.0, + "step": 5336 + }, + { + "epoch": 7.281309677488185, + "grad_norm": 0.23291084622261846, + "learning_rate": 2.379997610733912e-06, + "loss": 0.1734, + "num_tokens": 4070518045.0, + "step": 5337 + }, + { + "epoch": 7.282675258446865, + "grad_norm": 0.22664838769234846, + "learning_rate": 2.378569756232543e-06, + "loss": 0.1642, + "num_tokens": 4071321170.0, + "step": 5338 + }, + { + "epoch": 7.284040839405545, + "grad_norm": 0.2265070920137822, + "learning_rate": 2.3771445317561784e-06, + "loss": 0.1617, + "num_tokens": 4072146421.0, + "step": 5339 + }, + { + "epoch": 7.285406420364226, + "grad_norm": 0.23071296582891995, + "learning_rate": 2.3757219377395926e-06, + "loss": 0.1677, + "num_tokens": 4072943824.0, + "step": 5340 + }, + { + "epoch": 7.286772001322906, + "grad_norm": 0.22637830577661366, + "learning_rate": 2.374301974616758e-06, + "loss": 0.1675, + "num_tokens": 4073785930.0, + "step": 5341 + }, + { + "epoch": 7.288137582281587, + "grad_norm": 0.2353274327510552, + "learning_rate": 2.372884642820845e-06, + "loss": 0.161, + "num_tokens": 4074511130.0, + "step": 5342 + }, + { + "epoch": 7.289503163240267, + "grad_norm": 0.21453497905303526, + "learning_rate": 2.371469942784218e-06, + "loss": 0.1626, + "num_tokens": 4075253713.0, + "step": 5343 + }, + { + "epoch": 7.290868744198948, + "grad_norm": 0.2166187023405437, + "learning_rate": 2.3700578749384408e-06, + "loss": 0.1588, + "num_tokens": 4076045187.0, + "step": 5344 + }, + { + "epoch": 7.2922343251576285, + "grad_norm": 0.23532356898432044, + "learning_rate": 2.3686484397142755e-06, + "loss": 0.162, + "num_tokens": 4076796876.0, + "step": 5345 + }, + { + "epoch": 7.2935999061163095, + "grad_norm": 0.2443584448068937, + "learning_rate": 2.3672416375416794e-06, + "loss": 0.1671, + "num_tokens": 4077549042.0, + "step": 5346 + }, + { + "epoch": 7.29496548707499, + "grad_norm": 0.2287894595409, + "learning_rate": 2.3658374688498077e-06, + "loss": 0.1672, + "num_tokens": 4078340311.0, + "step": 5347 + }, + { + "epoch": 7.29633106803367, + "grad_norm": 0.23357606167993133, + "learning_rate": 2.3644359340670095e-06, + "loss": 0.1667, + "num_tokens": 4079079403.0, + "step": 5348 + }, + { + "epoch": 7.297696648992351, + "grad_norm": 0.2212286738350255, + "learning_rate": 2.363037033620832e-06, + "loss": 0.1689, + "num_tokens": 4079852914.0, + "step": 5349 + }, + { + "epoch": 7.299062229951031, + "grad_norm": 0.23843611969667422, + "learning_rate": 2.3616407679380235e-06, + "loss": 0.1586, + "num_tokens": 4080616859.0, + "step": 5350 + }, + { + "epoch": 7.300427810909712, + "grad_norm": 0.2337826034863261, + "learning_rate": 2.3602471374445183e-06, + "loss": 0.1558, + "num_tokens": 4081351601.0, + "step": 5351 + }, + { + "epoch": 7.301793391868392, + "grad_norm": 0.22928015222849152, + "learning_rate": 2.3588561425654562e-06, + "loss": 0.1627, + "num_tokens": 4082144677.0, + "step": 5352 + }, + { + "epoch": 7.303158972827073, + "grad_norm": 0.23484094153981833, + "learning_rate": 2.3574677837251695e-06, + "loss": 0.164, + "num_tokens": 4082902894.0, + "step": 5353 + }, + { + "epoch": 7.304524553785753, + "grad_norm": 0.2307795103464297, + "learning_rate": 2.356082061347186e-06, + "loss": 0.1624, + "num_tokens": 4083646842.0, + "step": 5354 + }, + { + "epoch": 7.305890134744434, + "grad_norm": 0.2227056997139569, + "learning_rate": 2.3546989758542275e-06, + "loss": 0.1647, + "num_tokens": 4084426267.0, + "step": 5355 + }, + { + "epoch": 7.307255715703114, + "grad_norm": 0.2069790503367002, + "learning_rate": 2.3533185276682183e-06, + "loss": 0.1618, + "num_tokens": 4085264453.0, + "step": 5356 + }, + { + "epoch": 7.308621296661794, + "grad_norm": 0.2341309399571618, + "learning_rate": 2.351940717210268e-06, + "loss": 0.1622, + "num_tokens": 4086013757.0, + "step": 5357 + }, + { + "epoch": 7.309986877620475, + "grad_norm": 0.2287620006214358, + "learning_rate": 2.3505655449006903e-06, + "loss": 0.1602, + "num_tokens": 4086749464.0, + "step": 5358 + }, + { + "epoch": 7.311352458579155, + "grad_norm": 0.21458961168624907, + "learning_rate": 2.34919301115899e-06, + "loss": 0.1639, + "num_tokens": 4087581399.0, + "step": 5359 + }, + { + "epoch": 7.312718039537836, + "grad_norm": 0.2238655309822278, + "learning_rate": 2.3478231164038668e-06, + "loss": 0.1681, + "num_tokens": 4088355418.0, + "step": 5360 + }, + { + "epoch": 7.3140836204965165, + "grad_norm": 0.23226191927930767, + "learning_rate": 2.3464558610532163e-06, + "loss": 0.1547, + "num_tokens": 4089107344.0, + "step": 5361 + }, + { + "epoch": 7.3154492014551975, + "grad_norm": 0.2256402917669762, + "learning_rate": 2.3450912455241297e-06, + "loss": 0.1636, + "num_tokens": 4089891100.0, + "step": 5362 + }, + { + "epoch": 7.316814782413878, + "grad_norm": 0.23318063664241018, + "learning_rate": 2.3437292702328923e-06, + "loss": 0.1552, + "num_tokens": 4090679412.0, + "step": 5363 + }, + { + "epoch": 7.318180363372559, + "grad_norm": 0.23139424280890059, + "learning_rate": 2.3423699355949827e-06, + "loss": 0.169, + "num_tokens": 4091462342.0, + "step": 5364 + }, + { + "epoch": 7.319545944331239, + "grad_norm": 0.2302311581162912, + "learning_rate": 2.3410132420250773e-06, + "loss": 0.1583, + "num_tokens": 4092183924.0, + "step": 5365 + }, + { + "epoch": 7.320911525289919, + "grad_norm": 0.22475481159438757, + "learning_rate": 2.3396591899370415e-06, + "loss": 0.1564, + "num_tokens": 4092989254.0, + "step": 5366 + }, + { + "epoch": 7.3222771062486, + "grad_norm": 0.23079883087516664, + "learning_rate": 2.3383077797439394e-06, + "loss": 0.1604, + "num_tokens": 4093707672.0, + "step": 5367 + }, + { + "epoch": 7.32364268720728, + "grad_norm": 0.21906490225736197, + "learning_rate": 2.3369590118580294e-06, + "loss": 0.165, + "num_tokens": 4094524170.0, + "step": 5368 + }, + { + "epoch": 7.325008268165961, + "grad_norm": 0.23469548578925717, + "learning_rate": 2.335612886690757e-06, + "loss": 0.1655, + "num_tokens": 4095282366.0, + "step": 5369 + }, + { + "epoch": 7.326373849124641, + "grad_norm": 0.21933136535897177, + "learning_rate": 2.3342694046527734e-06, + "loss": 0.1696, + "num_tokens": 4096062714.0, + "step": 5370 + }, + { + "epoch": 7.327739430083322, + "grad_norm": 0.2572986314417623, + "learning_rate": 2.332928566153912e-06, + "loss": 0.1644, + "num_tokens": 4096850799.0, + "step": 5371 + }, + { + "epoch": 7.329105011042002, + "grad_norm": 0.2323338807356399, + "learning_rate": 2.3315903716032066e-06, + "loss": 0.1643, + "num_tokens": 4097628559.0, + "step": 5372 + }, + { + "epoch": 7.330470592000683, + "grad_norm": 0.22421806750740247, + "learning_rate": 2.3302548214088826e-06, + "loss": 0.1591, + "num_tokens": 4098377219.0, + "step": 5373 + }, + { + "epoch": 7.331836172959363, + "grad_norm": 0.22688265931582377, + "learning_rate": 2.3289219159783568e-06, + "loss": 0.1619, + "num_tokens": 4099111959.0, + "step": 5374 + }, + { + "epoch": 7.333201753918043, + "grad_norm": 0.23351109651096505, + "learning_rate": 2.3275916557182427e-06, + "loss": 0.1577, + "num_tokens": 4099844682.0, + "step": 5375 + }, + { + "epoch": 7.334567334876724, + "grad_norm": 0.23526227399341407, + "learning_rate": 2.3262640410343433e-06, + "loss": 0.1646, + "num_tokens": 4100552257.0, + "step": 5376 + }, + { + "epoch": 7.3359329158354045, + "grad_norm": 0.2252775853811229, + "learning_rate": 2.3249390723316585e-06, + "loss": 0.1669, + "num_tokens": 4101346822.0, + "step": 5377 + }, + { + "epoch": 7.3372984967940855, + "grad_norm": 0.2368518468347841, + "learning_rate": 2.3236167500143763e-06, + "loss": 0.1693, + "num_tokens": 4102137010.0, + "step": 5378 + }, + { + "epoch": 7.338664077752766, + "grad_norm": 0.22933367550616685, + "learning_rate": 2.3222970744858835e-06, + "loss": 0.1638, + "num_tokens": 4102899674.0, + "step": 5379 + }, + { + "epoch": 7.340029658711447, + "grad_norm": 0.22509394643907243, + "learning_rate": 2.3209800461487527e-06, + "loss": 0.1597, + "num_tokens": 4103651815.0, + "step": 5380 + }, + { + "epoch": 7.341395239670127, + "grad_norm": 0.21611243261021196, + "learning_rate": 2.3196656654047533e-06, + "loss": 0.1608, + "num_tokens": 4104452462.0, + "step": 5381 + }, + { + "epoch": 7.342760820628808, + "grad_norm": 0.22231426537152488, + "learning_rate": 2.3183539326548474e-06, + "loss": 0.1631, + "num_tokens": 4105217468.0, + "step": 5382 + }, + { + "epoch": 7.344126401587488, + "grad_norm": 0.22383810394621334, + "learning_rate": 2.3170448482991852e-06, + "loss": 0.1594, + "num_tokens": 4106020919.0, + "step": 5383 + }, + { + "epoch": 7.345491982546168, + "grad_norm": 0.24300616086360613, + "learning_rate": 2.3157384127371123e-06, + "loss": 0.1599, + "num_tokens": 4106748395.0, + "step": 5384 + }, + { + "epoch": 7.346857563504849, + "grad_norm": 0.22448727016194572, + "learning_rate": 2.314434626367167e-06, + "loss": 0.1649, + "num_tokens": 4107578130.0, + "step": 5385 + }, + { + "epoch": 7.348223144463529, + "grad_norm": 0.23279767918257874, + "learning_rate": 2.3131334895870787e-06, + "loss": 0.1637, + "num_tokens": 4108323312.0, + "step": 5386 + }, + { + "epoch": 7.34958872542221, + "grad_norm": 0.2228066973587561, + "learning_rate": 2.311835002793764e-06, + "loss": 0.1585, + "num_tokens": 4109011541.0, + "step": 5387 + }, + { + "epoch": 7.35095430638089, + "grad_norm": 0.22503864226069176, + "learning_rate": 2.3105391663833397e-06, + "loss": 0.1675, + "num_tokens": 4109776069.0, + "step": 5388 + }, + { + "epoch": 7.352319887339571, + "grad_norm": 0.2179322752831177, + "learning_rate": 2.309245980751107e-06, + "loss": 0.1614, + "num_tokens": 4110515018.0, + "step": 5389 + }, + { + "epoch": 7.353685468298251, + "grad_norm": 0.22407336722399782, + "learning_rate": 2.307955446291559e-06, + "loss": 0.1613, + "num_tokens": 4111244827.0, + "step": 5390 + }, + { + "epoch": 7.355051049256932, + "grad_norm": 0.2120790968746116, + "learning_rate": 2.3066675633983863e-06, + "loss": 0.1573, + "num_tokens": 4112066462.0, + "step": 5391 + }, + { + "epoch": 7.356416630215612, + "grad_norm": 0.21994429932186366, + "learning_rate": 2.305382332464462e-06, + "loss": 0.1561, + "num_tokens": 4112725106.0, + "step": 5392 + }, + { + "epoch": 7.3577822111742925, + "grad_norm": 0.22286752150438327, + "learning_rate": 2.304099753881857e-06, + "loss": 0.1624, + "num_tokens": 4113472777.0, + "step": 5393 + }, + { + "epoch": 7.3591477921329735, + "grad_norm": 0.2337219169651089, + "learning_rate": 2.30281982804183e-06, + "loss": 0.165, + "num_tokens": 4114192929.0, + "step": 5394 + }, + { + "epoch": 7.360513373091654, + "grad_norm": 0.22033486221257223, + "learning_rate": 2.301542555334831e-06, + "loss": 0.1657, + "num_tokens": 4115004909.0, + "step": 5395 + }, + { + "epoch": 7.361878954050335, + "grad_norm": 0.22885481365673466, + "learning_rate": 2.3002679361504974e-06, + "loss": 0.1566, + "num_tokens": 4115809696.0, + "step": 5396 + }, + { + "epoch": 7.363244535009015, + "grad_norm": 0.2410229724947732, + "learning_rate": 2.298995970877666e-06, + "loss": 0.1656, + "num_tokens": 4116574713.0, + "step": 5397 + }, + { + "epoch": 7.364610115967696, + "grad_norm": 0.25609195964022596, + "learning_rate": 2.2977266599043553e-06, + "loss": 0.1751, + "num_tokens": 4117285435.0, + "step": 5398 + }, + { + "epoch": 7.365975696926376, + "grad_norm": 0.20931166650395885, + "learning_rate": 2.296460003617776e-06, + "loss": 0.1573, + "num_tokens": 4118112486.0, + "step": 5399 + }, + { + "epoch": 7.367341277885057, + "grad_norm": 0.23441735239791875, + "learning_rate": 2.295196002404332e-06, + "loss": 0.1705, + "num_tokens": 4118988403.0, + "step": 5400 + }, + { + "epoch": 7.368706858843737, + "grad_norm": 0.21892879444077784, + "learning_rate": 2.2939346566496142e-06, + "loss": 0.1657, + "num_tokens": 4119800262.0, + "step": 5401 + }, + { + "epoch": 7.370072439802417, + "grad_norm": 0.22531128149611235, + "learning_rate": 2.2926759667384056e-06, + "loss": 0.1673, + "num_tokens": 4120554871.0, + "step": 5402 + }, + { + "epoch": 7.371438020761098, + "grad_norm": 0.23767898264999146, + "learning_rate": 2.2914199330546764e-06, + "loss": 0.1656, + "num_tokens": 4121337220.0, + "step": 5403 + }, + { + "epoch": 7.372803601719778, + "grad_norm": 0.2633161649993797, + "learning_rate": 2.2901665559815914e-06, + "loss": 0.1614, + "num_tokens": 4122021848.0, + "step": 5404 + }, + { + "epoch": 7.374169182678459, + "grad_norm": 0.22081467390563517, + "learning_rate": 2.2889158359014978e-06, + "loss": 0.1603, + "num_tokens": 4122749179.0, + "step": 5405 + }, + { + "epoch": 7.375534763637139, + "grad_norm": 0.2518378470835453, + "learning_rate": 2.2876677731959374e-06, + "loss": 0.1575, + "num_tokens": 4123515932.0, + "step": 5406 + }, + { + "epoch": 7.37690034459582, + "grad_norm": 0.22287478526550192, + "learning_rate": 2.286422368245641e-06, + "loss": 0.1574, + "num_tokens": 4124270905.0, + "step": 5407 + }, + { + "epoch": 7.3782659255545004, + "grad_norm": 0.2361535247881203, + "learning_rate": 2.285179621430525e-06, + "loss": 0.1658, + "num_tokens": 4124975602.0, + "step": 5408 + }, + { + "epoch": 7.3796315065131814, + "grad_norm": 0.22577757546734017, + "learning_rate": 2.2839395331297028e-06, + "loss": 0.164, + "num_tokens": 4125744891.0, + "step": 5409 + }, + { + "epoch": 7.380997087471862, + "grad_norm": 0.23375317952488478, + "learning_rate": 2.282702103721467e-06, + "loss": 0.1658, + "num_tokens": 4126558346.0, + "step": 5410 + }, + { + "epoch": 7.382362668430542, + "grad_norm": 0.21472939844992567, + "learning_rate": 2.2814673335833055e-06, + "loss": 0.1607, + "num_tokens": 4127364544.0, + "step": 5411 + }, + { + "epoch": 7.383728249389223, + "grad_norm": 0.23498567082365704, + "learning_rate": 2.2802352230918927e-06, + "loss": 0.1586, + "num_tokens": 4128157771.0, + "step": 5412 + }, + { + "epoch": 7.385093830347903, + "grad_norm": 0.23007797008583242, + "learning_rate": 2.279005772623093e-06, + "loss": 0.1633, + "num_tokens": 4128873959.0, + "step": 5413 + }, + { + "epoch": 7.386459411306584, + "grad_norm": 0.2892705140450129, + "learning_rate": 2.2777789825519565e-06, + "loss": 0.1623, + "num_tokens": 4129575159.0, + "step": 5414 + }, + { + "epoch": 7.387824992265264, + "grad_norm": 0.23239039064603048, + "learning_rate": 2.2765548532527262e-06, + "loss": 0.1697, + "num_tokens": 4130343499.0, + "step": 5415 + }, + { + "epoch": 7.389190573223945, + "grad_norm": 0.23033456038534036, + "learning_rate": 2.275333385098831e-06, + "loss": 0.1687, + "num_tokens": 4131102787.0, + "step": 5416 + }, + { + "epoch": 7.390556154182625, + "grad_norm": 0.21873809480546852, + "learning_rate": 2.274114578462884e-06, + "loss": 0.1634, + "num_tokens": 4131898766.0, + "step": 5417 + }, + { + "epoch": 7.391921735141306, + "grad_norm": 0.22143957526553262, + "learning_rate": 2.2728984337166966e-06, + "loss": 0.159, + "num_tokens": 4132656536.0, + "step": 5418 + }, + { + "epoch": 7.393287316099986, + "grad_norm": 0.24054884608427313, + "learning_rate": 2.2716849512312556e-06, + "loss": 0.1635, + "num_tokens": 4133346237.0, + "step": 5419 + }, + { + "epoch": 7.394652897058666, + "grad_norm": 0.22532009039323486, + "learning_rate": 2.2704741313767463e-06, + "loss": 0.1652, + "num_tokens": 4134189582.0, + "step": 5420 + }, + { + "epoch": 7.396018478017347, + "grad_norm": 0.2157562469480373, + "learning_rate": 2.2692659745225364e-06, + "loss": 0.1552, + "num_tokens": 4134989699.0, + "step": 5421 + }, + { + "epoch": 7.397384058976027, + "grad_norm": 0.2219089794554517, + "learning_rate": 2.2680604810371797e-06, + "loss": 0.1609, + "num_tokens": 4135741480.0, + "step": 5422 + }, + { + "epoch": 7.398749639934708, + "grad_norm": 0.22826756704605286, + "learning_rate": 2.2668576512884237e-06, + "loss": 0.1645, + "num_tokens": 4136500074.0, + "step": 5423 + }, + { + "epoch": 7.4001152208933885, + "grad_norm": 0.24017276696125633, + "learning_rate": 2.2656574856431974e-06, + "loss": 0.1571, + "num_tokens": 4137173636.0, + "step": 5424 + }, + { + "epoch": 7.4014808018520695, + "grad_norm": 0.23686505140072578, + "learning_rate": 2.26445998446762e-06, + "loss": 0.1597, + "num_tokens": 4137924798.0, + "step": 5425 + }, + { + "epoch": 7.40284638281075, + "grad_norm": 0.22941482175217398, + "learning_rate": 2.263265148126997e-06, + "loss": 0.1614, + "num_tokens": 4138703041.0, + "step": 5426 + }, + { + "epoch": 7.404211963769431, + "grad_norm": 0.2262492556554366, + "learning_rate": 2.2620729769858224e-06, + "loss": 0.1658, + "num_tokens": 4139501233.0, + "step": 5427 + }, + { + "epoch": 7.405577544728111, + "grad_norm": 0.23557078668470724, + "learning_rate": 2.260883471407774e-06, + "loss": 0.1585, + "num_tokens": 4140262974.0, + "step": 5428 + }, + { + "epoch": 7.406943125686791, + "grad_norm": 0.2519025906470743, + "learning_rate": 2.2596966317557206e-06, + "loss": 0.1574, + "num_tokens": 4141023091.0, + "step": 5429 + }, + { + "epoch": 7.408308706645472, + "grad_norm": 0.22490592504843296, + "learning_rate": 2.2585124583917143e-06, + "loss": 0.1704, + "num_tokens": 4141814281.0, + "step": 5430 + }, + { + "epoch": 7.409674287604152, + "grad_norm": 0.22622606230475514, + "learning_rate": 2.2573309516769946e-06, + "loss": 0.1628, + "num_tokens": 4142606230.0, + "step": 5431 + }, + { + "epoch": 7.411039868562833, + "grad_norm": 0.2772416701984129, + "learning_rate": 2.256152111971989e-06, + "loss": 0.1629, + "num_tokens": 4143312583.0, + "step": 5432 + }, + { + "epoch": 7.412405449521513, + "grad_norm": 0.21021950980041723, + "learning_rate": 2.25497593963631e-06, + "loss": 0.1663, + "num_tokens": 4144215805.0, + "step": 5433 + }, + { + "epoch": 7.413771030480194, + "grad_norm": 0.23290846818078983, + "learning_rate": 2.2538024350287567e-06, + "loss": 0.1679, + "num_tokens": 4144951031.0, + "step": 5434 + }, + { + "epoch": 7.415136611438874, + "grad_norm": 0.22475026383591534, + "learning_rate": 2.252631598507316e-06, + "loss": 0.1595, + "num_tokens": 4145712658.0, + "step": 5435 + }, + { + "epoch": 7.416502192397555, + "grad_norm": 0.25808395729030437, + "learning_rate": 2.2514634304291586e-06, + "loss": 0.1683, + "num_tokens": 4146455701.0, + "step": 5436 + }, + { + "epoch": 7.417867773356235, + "grad_norm": 0.22431585702973372, + "learning_rate": 2.2502979311506407e-06, + "loss": 0.1617, + "num_tokens": 4147205041.0, + "step": 5437 + }, + { + "epoch": 7.419233354314915, + "grad_norm": 0.2216069727919839, + "learning_rate": 2.249135101027306e-06, + "loss": 0.1607, + "num_tokens": 4147930925.0, + "step": 5438 + }, + { + "epoch": 7.420598935273596, + "grad_norm": 0.22872514256900858, + "learning_rate": 2.247974940413886e-06, + "loss": 0.1648, + "num_tokens": 4148658370.0, + "step": 5439 + }, + { + "epoch": 7.4219645162322765, + "grad_norm": 0.21007083939862498, + "learning_rate": 2.246817449664292e-06, + "loss": 0.1632, + "num_tokens": 4149549310.0, + "step": 5440 + }, + { + "epoch": 7.4233300971909575, + "grad_norm": 0.23438433982900994, + "learning_rate": 2.2456626291316275e-06, + "loss": 0.1624, + "num_tokens": 4150300638.0, + "step": 5441 + }, + { + "epoch": 7.424695678149638, + "grad_norm": 0.2284251992642905, + "learning_rate": 2.2445104791681753e-06, + "loss": 0.1706, + "num_tokens": 4151143648.0, + "step": 5442 + }, + { + "epoch": 7.426061259108319, + "grad_norm": 0.2417271987505163, + "learning_rate": 2.2433610001254083e-06, + "loss": 0.1606, + "num_tokens": 4151838169.0, + "step": 5443 + }, + { + "epoch": 7.427426840066999, + "grad_norm": 0.22361063076221996, + "learning_rate": 2.2422141923539832e-06, + "loss": 0.1624, + "num_tokens": 4152620430.0, + "step": 5444 + }, + { + "epoch": 7.42879242102568, + "grad_norm": 0.2284761918129643, + "learning_rate": 2.2410700562037396e-06, + "loss": 0.1607, + "num_tokens": 4153337195.0, + "step": 5445 + }, + { + "epoch": 7.43015800198436, + "grad_norm": 0.23745975542986758, + "learning_rate": 2.239928592023705e-06, + "loss": 0.1637, + "num_tokens": 4154082976.0, + "step": 5446 + }, + { + "epoch": 7.43152358294304, + "grad_norm": 0.2298311767451642, + "learning_rate": 2.2387898001620907e-06, + "loss": 0.1556, + "num_tokens": 4154858235.0, + "step": 5447 + }, + { + "epoch": 7.432889163901721, + "grad_norm": 0.2355552281367777, + "learning_rate": 2.2376536809662934e-06, + "loss": 0.1559, + "num_tokens": 4155578727.0, + "step": 5448 + }, + { + "epoch": 7.434254744860401, + "grad_norm": 0.23045269188141895, + "learning_rate": 2.2365202347828922e-06, + "loss": 0.1597, + "num_tokens": 4156331529.0, + "step": 5449 + }, + { + "epoch": 7.435620325819082, + "grad_norm": 0.230259647708039, + "learning_rate": 2.235389461957653e-06, + "loss": 0.1693, + "num_tokens": 4157164345.0, + "step": 5450 + }, + { + "epoch": 7.436985906777762, + "grad_norm": 0.22994529688349277, + "learning_rate": 2.2342613628355266e-06, + "loss": 0.1559, + "num_tokens": 4157849679.0, + "step": 5451 + }, + { + "epoch": 7.438351487736443, + "grad_norm": 0.23548115797510027, + "learning_rate": 2.233135937760645e-06, + "loss": 0.169, + "num_tokens": 4158598366.0, + "step": 5452 + }, + { + "epoch": 7.439717068695123, + "grad_norm": 0.22739756076198167, + "learning_rate": 2.2320131870763304e-06, + "loss": 0.1627, + "num_tokens": 4159356430.0, + "step": 5453 + }, + { + "epoch": 7.441082649653804, + "grad_norm": 0.2507301457999663, + "learning_rate": 2.2308931111250824e-06, + "loss": 0.1643, + "num_tokens": 4160090988.0, + "step": 5454 + }, + { + "epoch": 7.442448230612484, + "grad_norm": 0.2179018354305088, + "learning_rate": 2.2297757102485877e-06, + "loss": 0.1629, + "num_tokens": 4160914099.0, + "step": 5455 + }, + { + "epoch": 7.4438138115711645, + "grad_norm": 0.24255547841418856, + "learning_rate": 2.228660984787719e-06, + "loss": 0.1591, + "num_tokens": 4161742068.0, + "step": 5456 + }, + { + "epoch": 7.4451793925298455, + "grad_norm": 0.22070178889061312, + "learning_rate": 2.227548935082529e-06, + "loss": 0.1648, + "num_tokens": 4162594640.0, + "step": 5457 + }, + { + "epoch": 7.446544973488526, + "grad_norm": 0.21769301296641685, + "learning_rate": 2.2264395614722546e-06, + "loss": 0.1595, + "num_tokens": 4163356647.0, + "step": 5458 + }, + { + "epoch": 7.447910554447207, + "grad_norm": 0.21905754343969122, + "learning_rate": 2.2253328642953213e-06, + "loss": 0.1593, + "num_tokens": 4164146467.0, + "step": 5459 + }, + { + "epoch": 7.449276135405887, + "grad_norm": 0.23417037009819328, + "learning_rate": 2.2242288438893333e-06, + "loss": 0.1645, + "num_tokens": 4164915824.0, + "step": 5460 + }, + { + "epoch": 7.450641716364568, + "grad_norm": 0.2316063881888279, + "learning_rate": 2.2231275005910763e-06, + "loss": 0.1699, + "num_tokens": 4165695668.0, + "step": 5461 + }, + { + "epoch": 7.452007297323248, + "grad_norm": 0.23074039011508662, + "learning_rate": 2.222028834736527e-06, + "loss": 0.1586, + "num_tokens": 4166426520.0, + "step": 5462 + }, + { + "epoch": 7.453372878281929, + "grad_norm": 0.22419746487946282, + "learning_rate": 2.220932846660838e-06, + "loss": 0.1636, + "num_tokens": 4167262428.0, + "step": 5463 + }, + { + "epoch": 7.454738459240609, + "grad_norm": 0.23946512475988727, + "learning_rate": 2.219839536698348e-06, + "loss": 0.1597, + "num_tokens": 4167975944.0, + "step": 5464 + }, + { + "epoch": 7.456104040199289, + "grad_norm": 0.2324745923146015, + "learning_rate": 2.218748905182579e-06, + "loss": 0.1576, + "num_tokens": 4168729649.0, + "step": 5465 + }, + { + "epoch": 7.45746962115797, + "grad_norm": 0.22045528954554758, + "learning_rate": 2.2176609524462354e-06, + "loss": 0.1663, + "num_tokens": 4169513202.0, + "step": 5466 + }, + { + "epoch": 7.45883520211665, + "grad_norm": 0.2277971879354185, + "learning_rate": 2.2165756788212045e-06, + "loss": 0.1496, + "num_tokens": 4170232010.0, + "step": 5467 + }, + { + "epoch": 7.460200783075331, + "grad_norm": 0.23467645391967995, + "learning_rate": 2.215493084638556e-06, + "loss": 0.17, + "num_tokens": 4170953220.0, + "step": 5468 + }, + { + "epoch": 7.461566364034011, + "grad_norm": 0.2357110112494784, + "learning_rate": 2.214413170228544e-06, + "loss": 0.1687, + "num_tokens": 4171707166.0, + "step": 5469 + }, + { + "epoch": 7.462931944992692, + "grad_norm": 0.2257504394353422, + "learning_rate": 2.2133359359206e-06, + "loss": 0.1583, + "num_tokens": 4172493230.0, + "step": 5470 + }, + { + "epoch": 7.464297525951372, + "grad_norm": 0.2300022981337233, + "learning_rate": 2.212261382043346e-06, + "loss": 0.1666, + "num_tokens": 4173224197.0, + "step": 5471 + }, + { + "epoch": 7.465663106910053, + "grad_norm": 0.22555337031371314, + "learning_rate": 2.2111895089245786e-06, + "loss": 0.1633, + "num_tokens": 4174001840.0, + "step": 5472 + }, + { + "epoch": 7.4670286878687335, + "grad_norm": 0.23069533129575095, + "learning_rate": 2.210120316891281e-06, + "loss": 0.1628, + "num_tokens": 4174752645.0, + "step": 5473 + }, + { + "epoch": 7.468394268827414, + "grad_norm": 0.24439364481760778, + "learning_rate": 2.2090538062696175e-06, + "loss": 0.1586, + "num_tokens": 4175423825.0, + "step": 5474 + }, + { + "epoch": 7.469759849786095, + "grad_norm": 0.2271174151854004, + "learning_rate": 2.2079899773849352e-06, + "loss": 0.1708, + "num_tokens": 4176225836.0, + "step": 5475 + }, + { + "epoch": 7.471125430744775, + "grad_norm": 0.23298841467470516, + "learning_rate": 2.2069288305617596e-06, + "loss": 0.1701, + "num_tokens": 4176997683.0, + "step": 5476 + }, + { + "epoch": 7.472491011703456, + "grad_norm": 0.22525121116689123, + "learning_rate": 2.2058703661238034e-06, + "loss": 0.1669, + "num_tokens": 4177852628.0, + "step": 5477 + }, + { + "epoch": 7.473856592662136, + "grad_norm": 0.24545973347416827, + "learning_rate": 2.2048145843939567e-06, + "loss": 0.159, + "num_tokens": 4178509634.0, + "step": 5478 + }, + { + "epoch": 7.475222173620817, + "grad_norm": 0.23203636477827122, + "learning_rate": 2.203761485694293e-06, + "loss": 0.1596, + "num_tokens": 4179195843.0, + "step": 5479 + }, + { + "epoch": 7.476587754579497, + "grad_norm": 0.2284662364301498, + "learning_rate": 2.202711070346068e-06, + "loss": 0.1752, + "num_tokens": 4180004847.0, + "step": 5480 + }, + { + "epoch": 7.477953335538178, + "grad_norm": 0.2340162880730845, + "learning_rate": 2.201663338669716e-06, + "loss": 0.157, + "num_tokens": 4180756201.0, + "step": 5481 + }, + { + "epoch": 7.479318916496858, + "grad_norm": 0.248499207811264, + "learning_rate": 2.2006182909848573e-06, + "loss": 0.1678, + "num_tokens": 4181439582.0, + "step": 5482 + }, + { + "epoch": 7.480684497455538, + "grad_norm": 0.23836672123451974, + "learning_rate": 2.1995759276102886e-06, + "loss": 0.1594, + "num_tokens": 4182130928.0, + "step": 5483 + }, + { + "epoch": 7.482050078414219, + "grad_norm": 0.2269814692514156, + "learning_rate": 2.19853624886399e-06, + "loss": 0.1578, + "num_tokens": 4182890035.0, + "step": 5484 + }, + { + "epoch": 7.483415659372899, + "grad_norm": 0.21827536438608663, + "learning_rate": 2.197499255063123e-06, + "loss": 0.1625, + "num_tokens": 4183683743.0, + "step": 5485 + }, + { + "epoch": 7.48478124033158, + "grad_norm": 0.23530599084127618, + "learning_rate": 2.19646494652403e-06, + "loss": 0.1649, + "num_tokens": 4184467586.0, + "step": 5486 + }, + { + "epoch": 7.48614682129026, + "grad_norm": 0.23156137836294632, + "learning_rate": 2.195433323562233e-06, + "loss": 0.1675, + "num_tokens": 4185241141.0, + "step": 5487 + }, + { + "epoch": 7.487512402248941, + "grad_norm": 0.22903587756903418, + "learning_rate": 2.194404386492434e-06, + "loss": 0.1577, + "num_tokens": 4186021640.0, + "step": 5488 + }, + { + "epoch": 7.4888779832076215, + "grad_norm": 0.22760272992787886, + "learning_rate": 2.193378135628521e-06, + "loss": 0.1507, + "num_tokens": 4186717049.0, + "step": 5489 + }, + { + "epoch": 7.4902435641663025, + "grad_norm": 0.2202943005927957, + "learning_rate": 2.192354571283555e-06, + "loss": 0.1755, + "num_tokens": 4187583237.0, + "step": 5490 + }, + { + "epoch": 7.491609145124983, + "grad_norm": 0.2224772101245489, + "learning_rate": 2.1913336937697838e-06, + "loss": 0.16, + "num_tokens": 4188327879.0, + "step": 5491 + }, + { + "epoch": 7.492974726083663, + "grad_norm": 0.2437925467704619, + "learning_rate": 2.190315503398632e-06, + "loss": 0.1667, + "num_tokens": 4189066775.0, + "step": 5492 + }, + { + "epoch": 7.494340307042344, + "grad_norm": 0.2173876376465795, + "learning_rate": 2.189300000480704e-06, + "loss": 0.1624, + "num_tokens": 4189906569.0, + "step": 5493 + }, + { + "epoch": 7.495705888001024, + "grad_norm": 0.3046790197215832, + "learning_rate": 2.188287185325786e-06, + "loss": 0.1583, + "num_tokens": 4190611056.0, + "step": 5494 + }, + { + "epoch": 7.497071468959705, + "grad_norm": 0.24047908728019865, + "learning_rate": 2.1872770582428463e-06, + "loss": 0.1651, + "num_tokens": 4191347172.0, + "step": 5495 + }, + { + "epoch": 7.498437049918385, + "grad_norm": 0.23080118845600903, + "learning_rate": 2.1862696195400302e-06, + "loss": 0.1695, + "num_tokens": 4192128320.0, + "step": 5496 + }, + { + "epoch": 7.499802630877066, + "grad_norm": 0.24549930220224558, + "learning_rate": 2.1852648695246603e-06, + "loss": 0.1617, + "num_tokens": 4192895087.0, + "step": 5497 + }, + { + "epoch": 7.501168211835746, + "grad_norm": 0.23521297534839372, + "learning_rate": 2.1842628085032474e-06, + "loss": 0.1719, + "num_tokens": 4193620655.0, + "step": 5498 + }, + { + "epoch": 7.502533792794427, + "grad_norm": 0.21629831405809177, + "learning_rate": 2.1832634367814733e-06, + "loss": 0.1673, + "num_tokens": 4194457929.0, + "step": 5499 + }, + { + "epoch": 7.503899373753107, + "grad_norm": 0.23899283956091946, + "learning_rate": 2.1822667546642053e-06, + "loss": 0.1654, + "num_tokens": 4195198127.0, + "step": 5500 + }, + { + "epoch": 7.505264954711787, + "grad_norm": 0.21519958918771503, + "learning_rate": 2.181272762455486e-06, + "loss": 0.1683, + "num_tokens": 4195999689.0, + "step": 5501 + }, + { + "epoch": 7.506630535670468, + "grad_norm": 0.2289392335814238, + "learning_rate": 2.1802814604585415e-06, + "loss": 0.1629, + "num_tokens": 4196746598.0, + "step": 5502 + }, + { + "epoch": 7.5079961166291485, + "grad_norm": 0.24934483054704798, + "learning_rate": 2.179292848975772e-06, + "loss": 0.1705, + "num_tokens": 4197526887.0, + "step": 5503 + }, + { + "epoch": 7.5093616975878295, + "grad_norm": 0.23114527158815684, + "learning_rate": 2.178306928308763e-06, + "loss": 0.1587, + "num_tokens": 4198238171.0, + "step": 5504 + }, + { + "epoch": 7.51072727854651, + "grad_norm": 0.2203592199464207, + "learning_rate": 2.1773236987582754e-06, + "loss": 0.1622, + "num_tokens": 4199013631.0, + "step": 5505 + }, + { + "epoch": 7.512092859505191, + "grad_norm": 0.22853439677623752, + "learning_rate": 2.17634316062425e-06, + "loss": 0.1564, + "num_tokens": 4199743217.0, + "step": 5506 + }, + { + "epoch": 7.513458440463871, + "grad_norm": 0.25059932581261285, + "learning_rate": 2.1753653142058046e-06, + "loss": 0.1578, + "num_tokens": 4200372510.0, + "step": 5507 + }, + { + "epoch": 7.514824021422552, + "grad_norm": 0.2258713299844882, + "learning_rate": 2.1743901598012397e-06, + "loss": 0.1545, + "num_tokens": 4201109315.0, + "step": 5508 + }, + { + "epoch": 7.516189602381232, + "grad_norm": 0.23002230918610533, + "learning_rate": 2.173417697708033e-06, + "loss": 0.1617, + "num_tokens": 4201822007.0, + "step": 5509 + }, + { + "epoch": 7.517555183339912, + "grad_norm": 0.2183740863375253, + "learning_rate": 2.172447928222841e-06, + "loss": 0.1631, + "num_tokens": 4202609220.0, + "step": 5510 + }, + { + "epoch": 7.518920764298593, + "grad_norm": 0.2198181518294382, + "learning_rate": 2.1714808516414957e-06, + "loss": 0.1703, + "num_tokens": 4203406508.0, + "step": 5511 + }, + { + "epoch": 7.520286345257273, + "grad_norm": 0.22738582248690328, + "learning_rate": 2.1705164682590128e-06, + "loss": 0.1604, + "num_tokens": 4204161855.0, + "step": 5512 + }, + { + "epoch": 7.521651926215954, + "grad_norm": 0.2256715125099633, + "learning_rate": 2.169554778369582e-06, + "loss": 0.1677, + "num_tokens": 4204985656.0, + "step": 5513 + }, + { + "epoch": 7.523017507174634, + "grad_norm": 0.23164497230222347, + "learning_rate": 2.1685957822665753e-06, + "loss": 0.1647, + "num_tokens": 4205738691.0, + "step": 5514 + }, + { + "epoch": 7.524383088133315, + "grad_norm": 0.24011936187132654, + "learning_rate": 2.1676394802425393e-06, + "loss": 0.1622, + "num_tokens": 4206429060.0, + "step": 5515 + }, + { + "epoch": 7.525748669091995, + "grad_norm": 0.2338030440863196, + "learning_rate": 2.166685872589201e-06, + "loss": 0.1648, + "num_tokens": 4207210208.0, + "step": 5516 + }, + { + "epoch": 7.527114250050676, + "grad_norm": 0.22972510343289876, + "learning_rate": 2.1657349595974646e-06, + "loss": 0.1656, + "num_tokens": 4207975644.0, + "step": 5517 + }, + { + "epoch": 7.528479831009356, + "grad_norm": 0.2617730148378994, + "learning_rate": 2.164786741557413e-06, + "loss": 0.1569, + "num_tokens": 4208678975.0, + "step": 5518 + }, + { + "epoch": 7.5298454119680365, + "grad_norm": 0.23046668257373867, + "learning_rate": 2.1638412187583056e-06, + "loss": 0.1562, + "num_tokens": 4209415556.0, + "step": 5519 + }, + { + "epoch": 7.5312109929267175, + "grad_norm": 0.24650723150960943, + "learning_rate": 2.1628983914885803e-06, + "loss": 0.1649, + "num_tokens": 4210201705.0, + "step": 5520 + }, + { + "epoch": 7.532576573885398, + "grad_norm": 0.22596135246867338, + "learning_rate": 2.1619582600358545e-06, + "loss": 0.1684, + "num_tokens": 4210976350.0, + "step": 5521 + }, + { + "epoch": 7.533942154844079, + "grad_norm": 0.22159350095386565, + "learning_rate": 2.1610208246869207e-06, + "loss": 0.1664, + "num_tokens": 4211788177.0, + "step": 5522 + }, + { + "epoch": 7.535307735802759, + "grad_norm": 0.22589921602042196, + "learning_rate": 2.1600860857277475e-06, + "loss": 0.1555, + "num_tokens": 4212533016.0, + "step": 5523 + }, + { + "epoch": 7.53667331676144, + "grad_norm": 0.24237008658790982, + "learning_rate": 2.1591540434434877e-06, + "loss": 0.162, + "num_tokens": 4213246739.0, + "step": 5524 + }, + { + "epoch": 7.53803889772012, + "grad_norm": 0.23985502586362126, + "learning_rate": 2.158224698118464e-06, + "loss": 0.1622, + "num_tokens": 4213917748.0, + "step": 5525 + }, + { + "epoch": 7.539404478678801, + "grad_norm": 0.22047689494018607, + "learning_rate": 2.1572980500361793e-06, + "loss": 0.1653, + "num_tokens": 4214723890.0, + "step": 5526 + }, + { + "epoch": 7.540770059637481, + "grad_norm": 0.22773861956241115, + "learning_rate": 2.1563740994793156e-06, + "loss": 0.1632, + "num_tokens": 4215514935.0, + "step": 5527 + }, + { + "epoch": 7.542135640596161, + "grad_norm": 0.2322147621156714, + "learning_rate": 2.1554528467297297e-06, + "loss": 0.166, + "num_tokens": 4216296505.0, + "step": 5528 + }, + { + "epoch": 7.543501221554842, + "grad_norm": 0.2255593958272912, + "learning_rate": 2.1545342920684544e-06, + "loss": 0.1674, + "num_tokens": 4217112885.0, + "step": 5529 + }, + { + "epoch": 7.544866802513522, + "grad_norm": 0.23378911465010338, + "learning_rate": 2.1536184357757023e-06, + "loss": 0.1637, + "num_tokens": 4217865474.0, + "step": 5530 + }, + { + "epoch": 7.546232383472203, + "grad_norm": 0.2267790415453801, + "learning_rate": 2.152705278130862e-06, + "loss": 0.1622, + "num_tokens": 4218624778.0, + "step": 5531 + }, + { + "epoch": 7.547597964430883, + "grad_norm": 0.28671473966655814, + "learning_rate": 2.1517948194124964e-06, + "loss": 0.1582, + "num_tokens": 4219373320.0, + "step": 5532 + }, + { + "epoch": 7.548963545389564, + "grad_norm": 0.22445377573573266, + "learning_rate": 2.1508870598983494e-06, + "loss": 0.1618, + "num_tokens": 4220088313.0, + "step": 5533 + }, + { + "epoch": 7.550329126348244, + "grad_norm": 0.2273240052272407, + "learning_rate": 2.149981999865337e-06, + "loss": 0.1632, + "num_tokens": 4220917675.0, + "step": 5534 + }, + { + "epoch": 7.551694707306925, + "grad_norm": 0.21645296814240506, + "learning_rate": 2.149079639589555e-06, + "loss": 0.1566, + "num_tokens": 4221707119.0, + "step": 5535 + }, + { + "epoch": 7.5530602882656055, + "grad_norm": 0.23556768009320467, + "learning_rate": 2.148179979346274e-06, + "loss": 0.1599, + "num_tokens": 4222388781.0, + "step": 5536 + }, + { + "epoch": 7.554425869224286, + "grad_norm": 0.22983303940123676, + "learning_rate": 2.1472830194099433e-06, + "loss": 0.164, + "num_tokens": 4223162294.0, + "step": 5537 + }, + { + "epoch": 7.555791450182967, + "grad_norm": 0.23254554436580332, + "learning_rate": 2.146388760054183e-06, + "loss": 0.1624, + "num_tokens": 4223915655.0, + "step": 5538 + }, + { + "epoch": 7.557157031141647, + "grad_norm": 0.2624553968508459, + "learning_rate": 2.145497201551795e-06, + "loss": 0.1692, + "num_tokens": 4224703165.0, + "step": 5539 + }, + { + "epoch": 7.558522612100328, + "grad_norm": 0.22632991156559723, + "learning_rate": 2.1446083441747556e-06, + "loss": 0.1555, + "num_tokens": 4225431541.0, + "step": 5540 + }, + { + "epoch": 7.559888193059008, + "grad_norm": 0.23957595766670822, + "learning_rate": 2.1437221881942144e-06, + "loss": 0.161, + "num_tokens": 4226139751.0, + "step": 5541 + }, + { + "epoch": 7.561253774017689, + "grad_norm": 0.21881229495062984, + "learning_rate": 2.1428387338805016e-06, + "loss": 0.1584, + "num_tokens": 4226983884.0, + "step": 5542 + }, + { + "epoch": 7.562619354976369, + "grad_norm": 0.22715462051204646, + "learning_rate": 2.1419579815031206e-06, + "loss": 0.1594, + "num_tokens": 4227715997.0, + "step": 5543 + }, + { + "epoch": 7.56398493593505, + "grad_norm": 0.22813273952291502, + "learning_rate": 2.14107993133075e-06, + "loss": 0.1619, + "num_tokens": 4228496400.0, + "step": 5544 + }, + { + "epoch": 7.56535051689373, + "grad_norm": 0.2396291826799164, + "learning_rate": 2.140204583631245e-06, + "loss": 0.1637, + "num_tokens": 4229234000.0, + "step": 5545 + }, + { + "epoch": 7.56671609785241, + "grad_norm": 0.22194638813440393, + "learning_rate": 2.139331938671636e-06, + "loss": 0.1603, + "num_tokens": 4230028081.0, + "step": 5546 + }, + { + "epoch": 7.568081678811091, + "grad_norm": 0.23081771234128629, + "learning_rate": 2.13846199671813e-06, + "loss": 0.1606, + "num_tokens": 4230778419.0, + "step": 5547 + }, + { + "epoch": 7.569447259769771, + "grad_norm": 0.23340666750307787, + "learning_rate": 2.1375947580361085e-06, + "loss": 0.1613, + "num_tokens": 4231480697.0, + "step": 5548 + }, + { + "epoch": 7.570812840728452, + "grad_norm": 0.2379337012504938, + "learning_rate": 2.1367302228901282e-06, + "loss": 0.166, + "num_tokens": 4232197174.0, + "step": 5549 + }, + { + "epoch": 7.572178421687132, + "grad_norm": 0.2379019763119666, + "learning_rate": 2.135868391543921e-06, + "loss": 0.1625, + "num_tokens": 4232922912.0, + "step": 5550 + }, + { + "epoch": 7.573544002645813, + "grad_norm": 0.250811060839321, + "learning_rate": 2.135009264260394e-06, + "loss": 0.157, + "num_tokens": 4233602241.0, + "step": 5551 + }, + { + "epoch": 7.5749095836044935, + "grad_norm": 0.2359871715732713, + "learning_rate": 2.134152841301632e-06, + "loss": 0.1642, + "num_tokens": 4234287631.0, + "step": 5552 + }, + { + "epoch": 7.5762751645631745, + "grad_norm": 0.21733438065396665, + "learning_rate": 2.1332991229288898e-06, + "loss": 0.1613, + "num_tokens": 4235094778.0, + "step": 5553 + }, + { + "epoch": 7.577640745521855, + "grad_norm": 0.2347293452009135, + "learning_rate": 2.132448109402601e-06, + "loss": 0.1719, + "num_tokens": 4235893463.0, + "step": 5554 + }, + { + "epoch": 7.579006326480535, + "grad_norm": 0.23525492918761154, + "learning_rate": 2.1315998009823725e-06, + "loss": 0.1701, + "num_tokens": 4236664179.0, + "step": 5555 + }, + { + "epoch": 7.580371907439216, + "grad_norm": 0.21905559813879413, + "learning_rate": 2.1307541979269884e-06, + "loss": 0.1599, + "num_tokens": 4237448857.0, + "step": 5556 + }, + { + "epoch": 7.581737488397896, + "grad_norm": 0.22292081240207917, + "learning_rate": 2.1299113004944034e-06, + "loss": 0.1679, + "num_tokens": 4238252299.0, + "step": 5557 + }, + { + "epoch": 7.583103069356577, + "grad_norm": 0.23009664532887078, + "learning_rate": 2.1290711089417497e-06, + "loss": 0.1588, + "num_tokens": 4238974110.0, + "step": 5558 + }, + { + "epoch": 7.584468650315257, + "grad_norm": 0.2301347477449659, + "learning_rate": 2.1282336235253327e-06, + "loss": 0.1585, + "num_tokens": 4239676460.0, + "step": 5559 + }, + { + "epoch": 7.585834231273938, + "grad_norm": 0.22782764762904362, + "learning_rate": 2.1273988445006344e-06, + "loss": 0.1614, + "num_tokens": 4240442537.0, + "step": 5560 + }, + { + "epoch": 7.587199812232618, + "grad_norm": 0.2385555706511238, + "learning_rate": 2.1265667721223097e-06, + "loss": 0.1635, + "num_tokens": 4241219742.0, + "step": 5561 + }, + { + "epoch": 7.588565393191299, + "grad_norm": 0.22426871921556685, + "learning_rate": 2.1257374066441857e-06, + "loss": 0.1534, + "num_tokens": 4241935772.0, + "step": 5562 + }, + { + "epoch": 7.589930974149979, + "grad_norm": 0.22777939946231038, + "learning_rate": 2.1249107483192677e-06, + "loss": 0.1586, + "num_tokens": 4242679511.0, + "step": 5563 + }, + { + "epoch": 7.591296555108659, + "grad_norm": 0.23555130565751028, + "learning_rate": 2.124086797399734e-06, + "loss": 0.1638, + "num_tokens": 4243388949.0, + "step": 5564 + }, + { + "epoch": 7.59266213606734, + "grad_norm": 0.24221875650731395, + "learning_rate": 2.1232655541369337e-06, + "loss": 0.1626, + "num_tokens": 4244127946.0, + "step": 5565 + }, + { + "epoch": 7.59402771702602, + "grad_norm": 0.24108301122319606, + "learning_rate": 2.122447018781395e-06, + "loss": 0.1603, + "num_tokens": 4244860431.0, + "step": 5566 + }, + { + "epoch": 7.595393297984701, + "grad_norm": 0.22859831500280756, + "learning_rate": 2.121631191582817e-06, + "loss": 0.1587, + "num_tokens": 4245602666.0, + "step": 5567 + }, + { + "epoch": 7.5967588789433815, + "grad_norm": 0.23096959752641216, + "learning_rate": 2.1208180727900717e-06, + "loss": 0.1639, + "num_tokens": 4246362040.0, + "step": 5568 + }, + { + "epoch": 7.5981244599020625, + "grad_norm": 0.21843940024098793, + "learning_rate": 2.1200076626512088e-06, + "loss": 0.1582, + "num_tokens": 4247185471.0, + "step": 5569 + }, + { + "epoch": 7.599490040860743, + "grad_norm": 0.24927281037017776, + "learning_rate": 2.119199961413448e-06, + "loss": 0.1594, + "num_tokens": 4247869239.0, + "step": 5570 + }, + { + "epoch": 7.600855621819424, + "grad_norm": 0.22776022585813438, + "learning_rate": 2.1183949693231843e-06, + "loss": 0.1649, + "num_tokens": 4248607679.0, + "step": 5571 + }, + { + "epoch": 7.602221202778104, + "grad_norm": 0.22870548308964725, + "learning_rate": 2.117592686625986e-06, + "loss": 0.1614, + "num_tokens": 4249340741.0, + "step": 5572 + }, + { + "epoch": 7.603586783736784, + "grad_norm": 0.22683098331234824, + "learning_rate": 2.1167931135665944e-06, + "loss": 0.1633, + "num_tokens": 4250090826.0, + "step": 5573 + }, + { + "epoch": 7.604952364695465, + "grad_norm": 0.2231332807634199, + "learning_rate": 2.1159962503889252e-06, + "loss": 0.1683, + "num_tokens": 4250919262.0, + "step": 5574 + }, + { + "epoch": 7.606317945654145, + "grad_norm": 0.2155636440576396, + "learning_rate": 2.1152020973360654e-06, + "loss": 0.1666, + "num_tokens": 4251735124.0, + "step": 5575 + }, + { + "epoch": 7.607683526612826, + "grad_norm": 0.22977739752236445, + "learning_rate": 2.1144106546502794e-06, + "loss": 0.1558, + "num_tokens": 4252507199.0, + "step": 5576 + }, + { + "epoch": 7.609049107571506, + "grad_norm": 0.22616557731485556, + "learning_rate": 2.113621922573e-06, + "loss": 0.1615, + "num_tokens": 4253297497.0, + "step": 5577 + }, + { + "epoch": 7.610414688530187, + "grad_norm": 0.2286926385382071, + "learning_rate": 2.1128359013448348e-06, + "loss": 0.1673, + "num_tokens": 4254082039.0, + "step": 5578 + }, + { + "epoch": 7.611780269488867, + "grad_norm": 0.22396008303396614, + "learning_rate": 2.112052591205567e-06, + "loss": 0.1653, + "num_tokens": 4254872067.0, + "step": 5579 + }, + { + "epoch": 7.613145850447548, + "grad_norm": 0.2314282837683109, + "learning_rate": 2.1112719923941486e-06, + "loss": 0.1638, + "num_tokens": 4255625883.0, + "step": 5580 + }, + { + "epoch": 7.614511431406228, + "grad_norm": 0.22014724717980177, + "learning_rate": 2.1104941051487077e-06, + "loss": 0.1614, + "num_tokens": 4256397935.0, + "step": 5581 + }, + { + "epoch": 7.615877012364908, + "grad_norm": 0.23160068243099677, + "learning_rate": 2.109718929706543e-06, + "loss": 0.1599, + "num_tokens": 4257103990.0, + "step": 5582 + }, + { + "epoch": 7.617242593323589, + "grad_norm": 0.22705993999122392, + "learning_rate": 2.1089464663041276e-06, + "loss": 0.1624, + "num_tokens": 4257898798.0, + "step": 5583 + }, + { + "epoch": 7.6186081742822696, + "grad_norm": 0.21799461545738255, + "learning_rate": 2.108176715177107e-06, + "loss": 0.1534, + "num_tokens": 4258763320.0, + "step": 5584 + }, + { + "epoch": 7.6199737552409506, + "grad_norm": 0.22946450840977553, + "learning_rate": 2.1074096765602986e-06, + "loss": 0.1647, + "num_tokens": 4259544492.0, + "step": 5585 + }, + { + "epoch": 7.621339336199631, + "grad_norm": 0.23161275476133786, + "learning_rate": 2.1066453506876915e-06, + "loss": 0.1651, + "num_tokens": 4260319941.0, + "step": 5586 + }, + { + "epoch": 7.622704917158312, + "grad_norm": 0.23188470536162867, + "learning_rate": 2.10588373779245e-06, + "loss": 0.1628, + "num_tokens": 4261094297.0, + "step": 5587 + }, + { + "epoch": 7.624070498116992, + "grad_norm": 0.22924320418228888, + "learning_rate": 2.1051248381069088e-06, + "loss": 0.1604, + "num_tokens": 4261875853.0, + "step": 5588 + }, + { + "epoch": 7.625436079075673, + "grad_norm": 0.22235161872365514, + "learning_rate": 2.104368651862575e-06, + "loss": 0.1595, + "num_tokens": 4262633431.0, + "step": 5589 + }, + { + "epoch": 7.626801660034353, + "grad_norm": 0.24294410489781848, + "learning_rate": 2.1036151792901284e-06, + "loss": 0.1667, + "num_tokens": 4263366019.0, + "step": 5590 + }, + { + "epoch": 7.628167240993033, + "grad_norm": 0.22445717660867656, + "learning_rate": 2.102864420619421e-06, + "loss": 0.1602, + "num_tokens": 4264138548.0, + "step": 5591 + }, + { + "epoch": 7.629532821951714, + "grad_norm": 0.226305430420918, + "learning_rate": 2.102116376079477e-06, + "loss": 0.1582, + "num_tokens": 4264915910.0, + "step": 5592 + }, + { + "epoch": 7.630898402910394, + "grad_norm": 0.242667506040787, + "learning_rate": 2.1013710458984925e-06, + "loss": 0.1626, + "num_tokens": 4265694151.0, + "step": 5593 + }, + { + "epoch": 7.632263983869075, + "grad_norm": 0.24569131691511192, + "learning_rate": 2.100628430303834e-06, + "loss": 0.1592, + "num_tokens": 4266397777.0, + "step": 5594 + }, + { + "epoch": 7.633629564827755, + "grad_norm": 0.23534177657419672, + "learning_rate": 2.0998885295220427e-06, + "loss": 0.1669, + "num_tokens": 4267159364.0, + "step": 5595 + }, + { + "epoch": 7.634995145786436, + "grad_norm": 0.23167649934848064, + "learning_rate": 2.0991513437788306e-06, + "loss": 0.1687, + "num_tokens": 4267961966.0, + "step": 5596 + }, + { + "epoch": 7.636360726745116, + "grad_norm": 0.22933977655395796, + "learning_rate": 2.0984168732990795e-06, + "loss": 0.1653, + "num_tokens": 4268702128.0, + "step": 5597 + }, + { + "epoch": 7.637726307703797, + "grad_norm": 0.23567798360752543, + "learning_rate": 2.097685118306846e-06, + "loss": 0.1681, + "num_tokens": 4269436421.0, + "step": 5598 + }, + { + "epoch": 7.6390918886624775, + "grad_norm": 0.2201935687639933, + "learning_rate": 2.0969560790253572e-06, + "loss": 0.171, + "num_tokens": 4270255269.0, + "step": 5599 + }, + { + "epoch": 7.640457469621158, + "grad_norm": 0.23402958780088984, + "learning_rate": 2.0962297556770093e-06, + "loss": 0.165, + "num_tokens": 4271045928.0, + "step": 5600 + }, + { + "epoch": 7.641823050579839, + "grad_norm": 0.2209829480382855, + "learning_rate": 2.0955061484833737e-06, + "loss": 0.1606, + "num_tokens": 4271813367.0, + "step": 5601 + }, + { + "epoch": 7.643188631538519, + "grad_norm": 0.2123923221330911, + "learning_rate": 2.094785257665193e-06, + "loss": 0.1562, + "num_tokens": 4272611906.0, + "step": 5602 + }, + { + "epoch": 7.6445542124972, + "grad_norm": 0.20934295094152916, + "learning_rate": 2.0940670834423758e-06, + "loss": 0.1589, + "num_tokens": 4273415167.0, + "step": 5603 + }, + { + "epoch": 7.64591979345588, + "grad_norm": 0.22745727803756655, + "learning_rate": 2.09335162603401e-06, + "loss": 0.1639, + "num_tokens": 4274183571.0, + "step": 5604 + }, + { + "epoch": 7.647285374414561, + "grad_norm": 0.23438347963361575, + "learning_rate": 2.0926388856583475e-06, + "loss": 0.1643, + "num_tokens": 4274919812.0, + "step": 5605 + }, + { + "epoch": 7.648650955373241, + "grad_norm": 0.22571750691377118, + "learning_rate": 2.0919288625328177e-06, + "loss": 0.1652, + "num_tokens": 4275708311.0, + "step": 5606 + }, + { + "epoch": 7.650016536331922, + "grad_norm": 0.23671008708624533, + "learning_rate": 2.091221556874015e-06, + "loss": 0.157, + "num_tokens": 4276429570.0, + "step": 5607 + }, + { + "epoch": 7.651382117290602, + "grad_norm": 0.23139679546701525, + "learning_rate": 2.0905169688977095e-06, + "loss": 0.1587, + "num_tokens": 4277148659.0, + "step": 5608 + }, + { + "epoch": 7.652747698249282, + "grad_norm": 0.21894467829856853, + "learning_rate": 2.0898150988188393e-06, + "loss": 0.1753, + "num_tokens": 4278018018.0, + "step": 5609 + }, + { + "epoch": 7.654113279207963, + "grad_norm": 0.6557668962701796, + "learning_rate": 2.0891159468515147e-06, + "loss": 0.1559, + "num_tokens": 4278707498.0, + "step": 5610 + }, + { + "epoch": 7.655478860166643, + "grad_norm": 0.23883127184638833, + "learning_rate": 2.0884195132090187e-06, + "loss": 0.1596, + "num_tokens": 4279407365.0, + "step": 5611 + }, + { + "epoch": 7.656844441125324, + "grad_norm": 0.2402950305737766, + "learning_rate": 2.0877257981037998e-06, + "loss": 0.1652, + "num_tokens": 4280197335.0, + "step": 5612 + }, + { + "epoch": 7.658210022084004, + "grad_norm": 0.2360624182193832, + "learning_rate": 2.0870348017474833e-06, + "loss": 0.1624, + "num_tokens": 4280878851.0, + "step": 5613 + }, + { + "epoch": 7.659575603042685, + "grad_norm": 0.22804426428487826, + "learning_rate": 2.086346524350861e-06, + "loss": 0.1688, + "num_tokens": 4281653166.0, + "step": 5614 + }, + { + "epoch": 7.6609411840013655, + "grad_norm": 0.5434807770038477, + "learning_rate": 2.0856609661238958e-06, + "loss": 0.1586, + "num_tokens": 4282414805.0, + "step": 5615 + }, + { + "epoch": 7.6623067649600465, + "grad_norm": 0.2218984823156082, + "learning_rate": 2.084978127275724e-06, + "loss": 0.1619, + "num_tokens": 4283220922.0, + "step": 5616 + }, + { + "epoch": 7.663672345918727, + "grad_norm": 0.22752028443454805, + "learning_rate": 2.0842980080146474e-06, + "loss": 0.1552, + "num_tokens": 4283942755.0, + "step": 5617 + }, + { + "epoch": 7.665037926877407, + "grad_norm": 0.2209312197164226, + "learning_rate": 2.0836206085481434e-06, + "loss": 0.1693, + "num_tokens": 4284729803.0, + "step": 5618 + }, + { + "epoch": 7.666403507836088, + "grad_norm": 0.2264874711019726, + "learning_rate": 2.0829459290828547e-06, + "loss": 0.1595, + "num_tokens": 4285479439.0, + "step": 5619 + }, + { + "epoch": 7.667769088794768, + "grad_norm": 0.23524366562930718, + "learning_rate": 2.0822739698245996e-06, + "loss": 0.1602, + "num_tokens": 4286246199.0, + "step": 5620 + }, + { + "epoch": 7.669134669753449, + "grad_norm": 0.2326030733628879, + "learning_rate": 2.0816047309783613e-06, + "loss": 0.1625, + "num_tokens": 4287012966.0, + "step": 5621 + }, + { + "epoch": 7.670500250712129, + "grad_norm": 0.22843521283782578, + "learning_rate": 2.080938212748298e-06, + "loss": 0.1712, + "num_tokens": 4287795106.0, + "step": 5622 + }, + { + "epoch": 7.67186583167081, + "grad_norm": 0.23204770479621506, + "learning_rate": 2.080274415337734e-06, + "loss": 0.168, + "num_tokens": 4288575740.0, + "step": 5623 + }, + { + "epoch": 7.67323141262949, + "grad_norm": 0.21909627774246884, + "learning_rate": 2.0796133389491642e-06, + "loss": 0.1572, + "num_tokens": 4289366514.0, + "step": 5624 + }, + { + "epoch": 7.674596993588171, + "grad_norm": 0.26196489866359984, + "learning_rate": 2.078954983784257e-06, + "loss": 0.166, + "num_tokens": 4290145448.0, + "step": 5625 + }, + { + "epoch": 7.675962574546851, + "grad_norm": 0.22778695260810575, + "learning_rate": 2.0782993500438467e-06, + "loss": 0.1645, + "num_tokens": 4290939396.0, + "step": 5626 + }, + { + "epoch": 7.677328155505531, + "grad_norm": 0.21393939883106536, + "learning_rate": 2.077646437927938e-06, + "loss": 0.1613, + "num_tokens": 4291772439.0, + "step": 5627 + }, + { + "epoch": 7.678693736464212, + "grad_norm": 0.24923267836755772, + "learning_rate": 2.076996247635707e-06, + "loss": 0.1671, + "num_tokens": 4292457538.0, + "step": 5628 + }, + { + "epoch": 7.680059317422892, + "grad_norm": 0.2316628065276969, + "learning_rate": 2.0763487793654985e-06, + "loss": 0.1657, + "num_tokens": 4293227394.0, + "step": 5629 + }, + { + "epoch": 7.681424898381573, + "grad_norm": 0.2253695108915374, + "learning_rate": 2.0757040333148256e-06, + "loss": 0.1654, + "num_tokens": 4293985898.0, + "step": 5630 + }, + { + "epoch": 7.6827904793402535, + "grad_norm": 0.2369140313434243, + "learning_rate": 2.0750620096803754e-06, + "loss": 0.1604, + "num_tokens": 4294802281.0, + "step": 5631 + }, + { + "epoch": 7.6841560602989345, + "grad_norm": 0.24863466774122447, + "learning_rate": 2.074422708658e-06, + "loss": 0.164, + "num_tokens": 4295554157.0, + "step": 5632 + }, + { + "epoch": 7.685521641257615, + "grad_norm": 0.26615768876365253, + "learning_rate": 2.0737861304427202e-06, + "loss": 0.1625, + "num_tokens": 4296332249.0, + "step": 5633 + }, + { + "epoch": 7.686887222216296, + "grad_norm": 0.22956231589603956, + "learning_rate": 2.0731522752287333e-06, + "loss": 0.1624, + "num_tokens": 4297064083.0, + "step": 5634 + }, + { + "epoch": 7.688252803174976, + "grad_norm": 0.22163347733383945, + "learning_rate": 2.072521143209397e-06, + "loss": 0.169, + "num_tokens": 4297864897.0, + "step": 5635 + }, + { + "epoch": 7.689618384133656, + "grad_norm": 0.22488991388255175, + "learning_rate": 2.0718927345772433e-06, + "loss": 0.1679, + "num_tokens": 4298641227.0, + "step": 5636 + }, + { + "epoch": 7.690983965092337, + "grad_norm": 0.23150333768130102, + "learning_rate": 2.071267049523973e-06, + "loss": 0.1698, + "num_tokens": 4299419364.0, + "step": 5637 + }, + { + "epoch": 7.692349546051017, + "grad_norm": 0.23845821684385485, + "learning_rate": 2.0706440882404557e-06, + "loss": 0.1692, + "num_tokens": 4300134093.0, + "step": 5638 + }, + { + "epoch": 7.693715127009698, + "grad_norm": 0.2233182935272216, + "learning_rate": 2.0700238509167298e-06, + "loss": 0.1618, + "num_tokens": 4300913438.0, + "step": 5639 + }, + { + "epoch": 7.695080707968378, + "grad_norm": 0.23469968153697288, + "learning_rate": 2.0694063377420013e-06, + "loss": 0.1555, + "num_tokens": 4301625451.0, + "step": 5640 + }, + { + "epoch": 7.696446288927059, + "grad_norm": 0.23618231856719774, + "learning_rate": 2.0687915489046497e-06, + "loss": 0.1681, + "num_tokens": 4302348341.0, + "step": 5641 + }, + { + "epoch": 7.697811869885739, + "grad_norm": 0.22986242274178742, + "learning_rate": 2.068179484592218e-06, + "loss": 0.1617, + "num_tokens": 4303083122.0, + "step": 5642 + }, + { + "epoch": 7.69917745084442, + "grad_norm": 0.23578369046557582, + "learning_rate": 2.067570144991421e-06, + "loss": 0.1676, + "num_tokens": 4303867621.0, + "step": 5643 + }, + { + "epoch": 7.7005430318031, + "grad_norm": 0.23189934429872586, + "learning_rate": 2.0669635302881423e-06, + "loss": 0.1697, + "num_tokens": 4304628453.0, + "step": 5644 + }, + { + "epoch": 7.70190861276178, + "grad_norm": 0.2344651358793124, + "learning_rate": 2.0663596406674335e-06, + "loss": 0.1676, + "num_tokens": 4305363500.0, + "step": 5645 + }, + { + "epoch": 7.703274193720461, + "grad_norm": 0.23220448229746343, + "learning_rate": 2.0657584763135156e-06, + "loss": 0.1653, + "num_tokens": 4306134091.0, + "step": 5646 + }, + { + "epoch": 7.7046397746791415, + "grad_norm": 0.24080867512847068, + "learning_rate": 2.0651600374097775e-06, + "loss": 0.1676, + "num_tokens": 4306888889.0, + "step": 5647 + }, + { + "epoch": 7.7060053556378225, + "grad_norm": 0.22649088685280952, + "learning_rate": 2.0645643241387765e-06, + "loss": 0.1644, + "num_tokens": 4307682161.0, + "step": 5648 + }, + { + "epoch": 7.707370936596503, + "grad_norm": 0.23168309715116134, + "learning_rate": 2.0639713366822396e-06, + "loss": 0.1643, + "num_tokens": 4308468276.0, + "step": 5649 + }, + { + "epoch": 7.708736517555184, + "grad_norm": 0.2208436645644644, + "learning_rate": 2.063381075221062e-06, + "loss": 0.1543, + "num_tokens": 4309211531.0, + "step": 5650 + }, + { + "epoch": 7.710102098513864, + "grad_norm": 0.24792998449696957, + "learning_rate": 2.062793539935305e-06, + "loss": 0.1639, + "num_tokens": 4309938678.0, + "step": 5651 + }, + { + "epoch": 7.711467679472545, + "grad_norm": 0.22905534391981697, + "learning_rate": 2.062208731004203e-06, + "loss": 0.1736, + "num_tokens": 4310728728.0, + "step": 5652 + }, + { + "epoch": 7.712833260431225, + "grad_norm": 0.2293452171570073, + "learning_rate": 2.061626648606153e-06, + "loss": 0.1652, + "num_tokens": 4311513003.0, + "step": 5653 + }, + { + "epoch": 7.714198841389905, + "grad_norm": 0.2443424620264314, + "learning_rate": 2.0610472929187244e-06, + "loss": 0.1656, + "num_tokens": 4312240353.0, + "step": 5654 + }, + { + "epoch": 7.715564422348586, + "grad_norm": 0.24374680337491908, + "learning_rate": 2.060470664118655e-06, + "loss": 0.1632, + "num_tokens": 4312948725.0, + "step": 5655 + }, + { + "epoch": 7.716930003307266, + "grad_norm": 0.250226850174123, + "learning_rate": 2.0598967623818466e-06, + "loss": 0.1708, + "num_tokens": 4313671782.0, + "step": 5656 + }, + { + "epoch": 7.718295584265947, + "grad_norm": 0.25342983834768384, + "learning_rate": 2.0593255878833733e-06, + "loss": 0.1581, + "num_tokens": 4314409234.0, + "step": 5657 + }, + { + "epoch": 7.719661165224627, + "grad_norm": 0.2556771768555858, + "learning_rate": 2.0587571407974756e-06, + "loss": 0.163, + "num_tokens": 4315219812.0, + "step": 5658 + }, + { + "epoch": 7.721026746183308, + "grad_norm": 0.2270600704174803, + "learning_rate": 2.0581914212975624e-06, + "loss": 0.1644, + "num_tokens": 4316004176.0, + "step": 5659 + }, + { + "epoch": 7.722392327141988, + "grad_norm": 0.22462075497014708, + "learning_rate": 2.057628429556209e-06, + "loss": 0.1619, + "num_tokens": 4316775897.0, + "step": 5660 + }, + { + "epoch": 7.723757908100669, + "grad_norm": 0.24277685830968798, + "learning_rate": 2.0570681657451614e-06, + "loss": 0.1602, + "num_tokens": 4317434547.0, + "step": 5661 + }, + { + "epoch": 7.725123489059349, + "grad_norm": 0.25223063685405184, + "learning_rate": 2.0565106300353297e-06, + "loss": 0.1613, + "num_tokens": 4318145943.0, + "step": 5662 + }, + { + "epoch": 7.7264890700180295, + "grad_norm": 0.2559312241450896, + "learning_rate": 2.055955822596797e-06, + "loss": 0.1675, + "num_tokens": 4318935831.0, + "step": 5663 + }, + { + "epoch": 7.7278546509767105, + "grad_norm": 0.23307853996582323, + "learning_rate": 2.0554037435988074e-06, + "loss": 0.1653, + "num_tokens": 4319712715.0, + "step": 5664 + }, + { + "epoch": 7.729220231935391, + "grad_norm": 0.23633622955554284, + "learning_rate": 2.0548543932097788e-06, + "loss": 0.163, + "num_tokens": 4320443316.0, + "step": 5665 + }, + { + "epoch": 7.730585812894072, + "grad_norm": 0.2455185756758313, + "learning_rate": 2.054307771597294e-06, + "loss": 0.1619, + "num_tokens": 4321094926.0, + "step": 5666 + }, + { + "epoch": 7.731951393852752, + "grad_norm": 0.22575486184759685, + "learning_rate": 2.0537638789281027e-06, + "loss": 0.1607, + "num_tokens": 4321899490.0, + "step": 5667 + }, + { + "epoch": 7.733316974811433, + "grad_norm": 0.23578296058722556, + "learning_rate": 2.0532227153681226e-06, + "loss": 0.1612, + "num_tokens": 4322661455.0, + "step": 5668 + }, + { + "epoch": 7.734682555770113, + "grad_norm": 0.23569997217261338, + "learning_rate": 2.052684281082441e-06, + "loss": 0.1608, + "num_tokens": 4323399130.0, + "step": 5669 + }, + { + "epoch": 7.736048136728794, + "grad_norm": 0.23909288153668423, + "learning_rate": 2.05214857623531e-06, + "loss": 0.1665, + "num_tokens": 4324181042.0, + "step": 5670 + }, + { + "epoch": 7.737413717687474, + "grad_norm": 0.21650033339881938, + "learning_rate": 2.0516156009901496e-06, + "loss": 0.1589, + "num_tokens": 4324963362.0, + "step": 5671 + }, + { + "epoch": 7.738779298646154, + "grad_norm": 0.2185085629754561, + "learning_rate": 2.051085355509548e-06, + "loss": 0.157, + "num_tokens": 4325731492.0, + "step": 5672 + }, + { + "epoch": 7.740144879604835, + "grad_norm": 0.22665077790870772, + "learning_rate": 2.0505578399552596e-06, + "loss": 0.1609, + "num_tokens": 4326579466.0, + "step": 5673 + }, + { + "epoch": 7.741510460563515, + "grad_norm": 0.2420843286434037, + "learning_rate": 2.0500330544882066e-06, + "loss": 0.1626, + "num_tokens": 4327254928.0, + "step": 5674 + }, + { + "epoch": 7.742876041522196, + "grad_norm": 0.21984156823866063, + "learning_rate": 2.0495109992684788e-06, + "loss": 0.1582, + "num_tokens": 4328031796.0, + "step": 5675 + }, + { + "epoch": 7.744241622480876, + "grad_norm": 0.22008861924819703, + "learning_rate": 2.048991674455333e-06, + "loss": 0.1698, + "num_tokens": 4328884460.0, + "step": 5676 + }, + { + "epoch": 7.745607203439557, + "grad_norm": 0.2266049104319786, + "learning_rate": 2.048475080207191e-06, + "loss": 0.1679, + "num_tokens": 4329672937.0, + "step": 5677 + }, + { + "epoch": 7.746972784398237, + "grad_norm": 0.22814706169759635, + "learning_rate": 2.047961216681645e-06, + "loss": 0.1637, + "num_tokens": 4330525777.0, + "step": 5678 + }, + { + "epoch": 7.7483383653569184, + "grad_norm": 0.21482636546633013, + "learning_rate": 2.047450084035452e-06, + "loss": 0.1589, + "num_tokens": 4331371549.0, + "step": 5679 + }, + { + "epoch": 7.749703946315599, + "grad_norm": 0.23924548712841862, + "learning_rate": 2.0469416824245368e-06, + "loss": 0.1635, + "num_tokens": 4332102751.0, + "step": 5680 + }, + { + "epoch": 7.751069527274279, + "grad_norm": 0.23707449878166037, + "learning_rate": 2.0464360120039893e-06, + "loss": 0.1662, + "num_tokens": 4332826462.0, + "step": 5681 + }, + { + "epoch": 7.75243510823296, + "grad_norm": 0.2332907433357398, + "learning_rate": 2.0459330729280693e-06, + "loss": 0.1538, + "num_tokens": 4333593586.0, + "step": 5682 + }, + { + "epoch": 7.75380068919164, + "grad_norm": 0.22227697584953587, + "learning_rate": 2.045432865350201e-06, + "loss": 0.17, + "num_tokens": 4334394964.0, + "step": 5683 + }, + { + "epoch": 7.755166270150321, + "grad_norm": 0.24641471647896343, + "learning_rate": 2.044935389422977e-06, + "loss": 0.1605, + "num_tokens": 4335121580.0, + "step": 5684 + }, + { + "epoch": 7.756531851109001, + "grad_norm": 0.23110830376079727, + "learning_rate": 2.0444406452981537e-06, + "loss": 0.1642, + "num_tokens": 4335891077.0, + "step": 5685 + }, + { + "epoch": 7.757897432067682, + "grad_norm": 0.21981947332053675, + "learning_rate": 2.043948633126657e-06, + "loss": 0.1694, + "num_tokens": 4336732907.0, + "step": 5686 + }, + { + "epoch": 7.759263013026362, + "grad_norm": 0.22728343245644558, + "learning_rate": 2.0434593530585798e-06, + "loss": 0.1613, + "num_tokens": 4337476574.0, + "step": 5687 + }, + { + "epoch": 7.760628593985043, + "grad_norm": 0.2254435308476042, + "learning_rate": 2.0429728052431778e-06, + "loss": 0.1716, + "num_tokens": 4338242953.0, + "step": 5688 + }, + { + "epoch": 7.761994174943723, + "grad_norm": 0.23334919113182714, + "learning_rate": 2.042488989828878e-06, + "loss": 0.1597, + "num_tokens": 4338940363.0, + "step": 5689 + }, + { + "epoch": 7.763359755902403, + "grad_norm": 0.24782873893939325, + "learning_rate": 2.0420079069632698e-06, + "loss": 0.1755, + "num_tokens": 4339717726.0, + "step": 5690 + }, + { + "epoch": 7.764725336861084, + "grad_norm": 0.23448511595156893, + "learning_rate": 2.0415295567931124e-06, + "loss": 0.1577, + "num_tokens": 4340436587.0, + "step": 5691 + }, + { + "epoch": 7.766090917819764, + "grad_norm": 0.22926705753515136, + "learning_rate": 2.041053939464328e-06, + "loss": 0.1656, + "num_tokens": 4341200795.0, + "step": 5692 + }, + { + "epoch": 7.767456498778445, + "grad_norm": 0.2273107269925038, + "learning_rate": 2.040581055122009e-06, + "loss": 0.163, + "num_tokens": 4341984068.0, + "step": 5693 + }, + { + "epoch": 7.7688220797371255, + "grad_norm": 0.22361047194847913, + "learning_rate": 2.040110903910411e-06, + "loss": 0.1687, + "num_tokens": 4342809273.0, + "step": 5694 + }, + { + "epoch": 7.7701876606958065, + "grad_norm": 0.21466018367330225, + "learning_rate": 2.0396434859729546e-06, + "loss": 0.1599, + "num_tokens": 4343622220.0, + "step": 5695 + }, + { + "epoch": 7.771553241654487, + "grad_norm": 0.23164027787933114, + "learning_rate": 2.0391788014522313e-06, + "loss": 0.1632, + "num_tokens": 4344358399.0, + "step": 5696 + }, + { + "epoch": 7.772918822613168, + "grad_norm": 0.22661777065008248, + "learning_rate": 2.0387168504899954e-06, + "loss": 0.1663, + "num_tokens": 4345143797.0, + "step": 5697 + }, + { + "epoch": 7.774284403571848, + "grad_norm": 0.23498770018744924, + "learning_rate": 2.0382576332271683e-06, + "loss": 0.165, + "num_tokens": 4345873593.0, + "step": 5698 + }, + { + "epoch": 7.775649984530528, + "grad_norm": 0.23078714946272702, + "learning_rate": 2.0378011498038373e-06, + "loss": 0.1627, + "num_tokens": 4346677840.0, + "step": 5699 + }, + { + "epoch": 7.777015565489209, + "grad_norm": 0.21666362952645107, + "learning_rate": 2.0373474003592554e-06, + "loss": 0.1617, + "num_tokens": 4347518016.0, + "step": 5700 + }, + { + "epoch": 7.778381146447889, + "grad_norm": 0.27560741888472323, + "learning_rate": 2.0368963850318423e-06, + "loss": 0.1669, + "num_tokens": 4348242363.0, + "step": 5701 + }, + { + "epoch": 7.77974672740657, + "grad_norm": 0.2304284975952098, + "learning_rate": 2.036448103959183e-06, + "loss": 0.1667, + "num_tokens": 4349004666.0, + "step": 5702 + }, + { + "epoch": 7.78111230836525, + "grad_norm": 0.22658286695172702, + "learning_rate": 2.0360025572780296e-06, + "loss": 0.167, + "num_tokens": 4349746566.0, + "step": 5703 + }, + { + "epoch": 7.782477889323931, + "grad_norm": 0.23824088289721332, + "learning_rate": 2.035559745124297e-06, + "loss": 0.1594, + "num_tokens": 4350522403.0, + "step": 5704 + }, + { + "epoch": 7.783843470282611, + "grad_norm": 0.2354651711644883, + "learning_rate": 2.035119667633071e-06, + "loss": 0.1712, + "num_tokens": 4351323412.0, + "step": 5705 + }, + { + "epoch": 7.785209051241292, + "grad_norm": 0.2231076846835459, + "learning_rate": 2.0346823249385964e-06, + "loss": 0.1687, + "num_tokens": 4352090319.0, + "step": 5706 + }, + { + "epoch": 7.786574632199972, + "grad_norm": 0.2423707454473669, + "learning_rate": 2.0342477171742903e-06, + "loss": 0.1695, + "num_tokens": 4352829996.0, + "step": 5707 + }, + { + "epoch": 7.787940213158652, + "grad_norm": 0.23278357525332485, + "learning_rate": 2.0338158444727327e-06, + "loss": 0.1609, + "num_tokens": 4353605066.0, + "step": 5708 + }, + { + "epoch": 7.789305794117333, + "grad_norm": 0.23088120536019338, + "learning_rate": 2.033386706965668e-06, + "loss": 0.1519, + "num_tokens": 4354325949.0, + "step": 5709 + }, + { + "epoch": 7.7906713750760135, + "grad_norm": 0.22316114721375432, + "learning_rate": 2.0329603047840076e-06, + "loss": 0.1632, + "num_tokens": 4355152884.0, + "step": 5710 + }, + { + "epoch": 7.7920369560346945, + "grad_norm": 0.22193539125647546, + "learning_rate": 2.032536638057829e-06, + "loss": 0.1673, + "num_tokens": 4355961251.0, + "step": 5711 + }, + { + "epoch": 7.793402536993375, + "grad_norm": 0.2209789640136189, + "learning_rate": 2.0321157069163745e-06, + "loss": 0.166, + "num_tokens": 4356759682.0, + "step": 5712 + }, + { + "epoch": 7.794768117952056, + "grad_norm": 0.2298567191915556, + "learning_rate": 2.0316975114880514e-06, + "loss": 0.1609, + "num_tokens": 4357551264.0, + "step": 5713 + }, + { + "epoch": 7.796133698910736, + "grad_norm": 0.22451504309700104, + "learning_rate": 2.031282051900434e-06, + "loss": 0.1663, + "num_tokens": 4358305887.0, + "step": 5714 + }, + { + "epoch": 7.797499279869417, + "grad_norm": 0.22725607142980153, + "learning_rate": 2.03086932828026e-06, + "loss": 0.1662, + "num_tokens": 4359098670.0, + "step": 5715 + }, + { + "epoch": 7.798864860828097, + "grad_norm": 0.22606529697135308, + "learning_rate": 2.030459340753434e-06, + "loss": 0.1639, + "num_tokens": 4359918950.0, + "step": 5716 + }, + { + "epoch": 7.800230441786777, + "grad_norm": 0.24451090606679707, + "learning_rate": 2.0300520894450256e-06, + "loss": 0.1614, + "num_tokens": 4360609556.0, + "step": 5717 + }, + { + "epoch": 7.801596022745458, + "grad_norm": 0.2306603844569006, + "learning_rate": 2.0296475744792686e-06, + "loss": 0.1662, + "num_tokens": 4361378677.0, + "step": 5718 + }, + { + "epoch": 7.802961603704138, + "grad_norm": 0.2168357129117411, + "learning_rate": 2.029245795979564e-06, + "loss": 0.1593, + "num_tokens": 4362154272.0, + "step": 5719 + }, + { + "epoch": 7.804327184662819, + "grad_norm": 0.2367226112014442, + "learning_rate": 2.028846754068477e-06, + "loss": 0.1569, + "num_tokens": 4362891551.0, + "step": 5720 + }, + { + "epoch": 7.805692765621499, + "grad_norm": 0.22715230304945933, + "learning_rate": 2.0284504488677378e-06, + "loss": 0.1628, + "num_tokens": 4363717693.0, + "step": 5721 + }, + { + "epoch": 7.80705834658018, + "grad_norm": 0.24923957447817674, + "learning_rate": 2.0280568804982413e-06, + "loss": 0.1611, + "num_tokens": 4364430068.0, + "step": 5722 + }, + { + "epoch": 7.80842392753886, + "grad_norm": 0.2239697470869312, + "learning_rate": 2.0276660490800493e-06, + "loss": 0.1729, + "num_tokens": 4365146133.0, + "step": 5723 + }, + { + "epoch": 7.809789508497541, + "grad_norm": 0.23542700461522417, + "learning_rate": 2.027277954732387e-06, + "loss": 0.1534, + "num_tokens": 4365900939.0, + "step": 5724 + }, + { + "epoch": 7.811155089456221, + "grad_norm": 0.2216613721472706, + "learning_rate": 2.0268925975736448e-06, + "loss": 0.154, + "num_tokens": 4366649632.0, + "step": 5725 + }, + { + "epoch": 7.8125206704149015, + "grad_norm": 0.2269030139121805, + "learning_rate": 2.026509977721379e-06, + "loss": 0.1672, + "num_tokens": 4367449728.0, + "step": 5726 + }, + { + "epoch": 7.8138862513735825, + "grad_norm": 0.218361026103539, + "learning_rate": 2.0261300952923098e-06, + "loss": 0.1571, + "num_tokens": 4368233746.0, + "step": 5727 + }, + { + "epoch": 7.815251832332263, + "grad_norm": 0.23208570208713702, + "learning_rate": 2.0257529504023237e-06, + "loss": 0.1581, + "num_tokens": 4368952106.0, + "step": 5728 + }, + { + "epoch": 7.816617413290944, + "grad_norm": 0.23705078823700867, + "learning_rate": 2.0253785431664707e-06, + "loss": 0.1612, + "num_tokens": 4369733180.0, + "step": 5729 + }, + { + "epoch": 7.817982994249624, + "grad_norm": 0.2299089153466336, + "learning_rate": 2.025006873698966e-06, + "loss": 0.1633, + "num_tokens": 4370484183.0, + "step": 5730 + }, + { + "epoch": 7.819348575208305, + "grad_norm": 0.2390415799414524, + "learning_rate": 2.024637942113191e-06, + "loss": 0.1625, + "num_tokens": 4371231738.0, + "step": 5731 + }, + { + "epoch": 7.820714156166985, + "grad_norm": 0.23699975754806069, + "learning_rate": 2.02427174852169e-06, + "loss": 0.1625, + "num_tokens": 4371947466.0, + "step": 5732 + }, + { + "epoch": 7.822079737125666, + "grad_norm": 0.24114459665648982, + "learning_rate": 2.023908293036172e-06, + "loss": 0.167, + "num_tokens": 4372706425.0, + "step": 5733 + }, + { + "epoch": 7.823445318084346, + "grad_norm": 0.2318576601466569, + "learning_rate": 2.023547575767512e-06, + "loss": 0.1616, + "num_tokens": 4373479554.0, + "step": 5734 + }, + { + "epoch": 7.824810899043026, + "grad_norm": 0.2286933648806978, + "learning_rate": 2.0231895968257503e-06, + "loss": 0.1626, + "num_tokens": 4374191308.0, + "step": 5735 + }, + { + "epoch": 7.826176480001707, + "grad_norm": 0.22783816435490872, + "learning_rate": 2.0228343563200904e-06, + "loss": 0.1698, + "num_tokens": 4374959117.0, + "step": 5736 + }, + { + "epoch": 7.827542060960387, + "grad_norm": 0.22249672129837558, + "learning_rate": 2.0224818543589e-06, + "loss": 0.1626, + "num_tokens": 4375759560.0, + "step": 5737 + }, + { + "epoch": 7.828907641919068, + "grad_norm": 0.23352062420770436, + "learning_rate": 2.0221320910497117e-06, + "loss": 0.1645, + "num_tokens": 4376504922.0, + "step": 5738 + }, + { + "epoch": 7.830273222877748, + "grad_norm": 0.22390704223636193, + "learning_rate": 2.0217850664992252e-06, + "loss": 0.1636, + "num_tokens": 4377284243.0, + "step": 5739 + }, + { + "epoch": 7.831638803836429, + "grad_norm": 0.24637578668792356, + "learning_rate": 2.0214407808133007e-06, + "loss": 0.1599, + "num_tokens": 4378062143.0, + "step": 5740 + }, + { + "epoch": 7.833004384795109, + "grad_norm": 0.23071453219811044, + "learning_rate": 2.0210992340969666e-06, + "loss": 0.1672, + "num_tokens": 4378843580.0, + "step": 5741 + }, + { + "epoch": 7.83436996575379, + "grad_norm": 0.23710977812868814, + "learning_rate": 2.0207604264544124e-06, + "loss": 0.1698, + "num_tokens": 4379566578.0, + "step": 5742 + }, + { + "epoch": 7.8357355467124705, + "grad_norm": 0.22746995952955124, + "learning_rate": 2.020424357988994e-06, + "loss": 0.1609, + "num_tokens": 4380375684.0, + "step": 5743 + }, + { + "epoch": 7.837101127671151, + "grad_norm": 0.22590324113629393, + "learning_rate": 2.020091028803233e-06, + "loss": 0.1604, + "num_tokens": 4381111621.0, + "step": 5744 + }, + { + "epoch": 7.838466708629832, + "grad_norm": 0.236031069708685, + "learning_rate": 2.0197604389988102e-06, + "loss": 0.1638, + "num_tokens": 4381847102.0, + "step": 5745 + }, + { + "epoch": 7.839832289588512, + "grad_norm": 0.22523824978792492, + "learning_rate": 2.019432588676578e-06, + "loss": 0.1638, + "num_tokens": 4382647528.0, + "step": 5746 + }, + { + "epoch": 7.841197870547193, + "grad_norm": 0.2400957039473371, + "learning_rate": 2.0191074779365467e-06, + "loss": 0.1637, + "num_tokens": 4383335820.0, + "step": 5747 + }, + { + "epoch": 7.842563451505873, + "grad_norm": 0.22913069777931405, + "learning_rate": 2.018785106877895e-06, + "loss": 0.1638, + "num_tokens": 4384130788.0, + "step": 5748 + }, + { + "epoch": 7.843929032464554, + "grad_norm": 0.3513505194952437, + "learning_rate": 2.0184654755989637e-06, + "loss": 0.1629, + "num_tokens": 4384836634.0, + "step": 5749 + }, + { + "epoch": 7.845294613423234, + "grad_norm": 0.23031107382293126, + "learning_rate": 2.018148584197258e-06, + "loss": 0.1581, + "num_tokens": 4385552495.0, + "step": 5750 + }, + { + "epoch": 7.846660194381915, + "grad_norm": 0.23592917949762848, + "learning_rate": 2.0178344327694498e-06, + "loss": 0.1648, + "num_tokens": 4386363784.0, + "step": 5751 + }, + { + "epoch": 7.848025775340595, + "grad_norm": 0.21840499656084786, + "learning_rate": 2.01752302141137e-06, + "loss": 0.1577, + "num_tokens": 4387160920.0, + "step": 5752 + }, + { + "epoch": 7.849391356299275, + "grad_norm": 0.23721222503906794, + "learning_rate": 2.01721435021802e-06, + "loss": 0.1575, + "num_tokens": 4387875521.0, + "step": 5753 + }, + { + "epoch": 7.850756937257956, + "grad_norm": 0.2161465991853738, + "learning_rate": 2.0169084192835594e-06, + "loss": 0.1698, + "num_tokens": 4388702632.0, + "step": 5754 + }, + { + "epoch": 7.852122518216636, + "grad_norm": 0.22947820256753523, + "learning_rate": 2.016605228701315e-06, + "loss": 0.1676, + "num_tokens": 4389482307.0, + "step": 5755 + }, + { + "epoch": 7.853488099175317, + "grad_norm": 0.2216088296414581, + "learning_rate": 2.0163047785637786e-06, + "loss": 0.1625, + "num_tokens": 4390253423.0, + "step": 5756 + }, + { + "epoch": 7.854853680133997, + "grad_norm": 0.2291404959661472, + "learning_rate": 2.0160070689626035e-06, + "loss": 0.1693, + "num_tokens": 4390994550.0, + "step": 5757 + }, + { + "epoch": 7.856219261092678, + "grad_norm": 0.2426440145807451, + "learning_rate": 2.0157120999886073e-06, + "loss": 0.1669, + "num_tokens": 4391709670.0, + "step": 5758 + }, + { + "epoch": 7.8575848420513585, + "grad_norm": 0.2224656898226779, + "learning_rate": 2.0154198717317733e-06, + "loss": 0.1664, + "num_tokens": 4392487443.0, + "step": 5759 + }, + { + "epoch": 7.8589504230100395, + "grad_norm": 0.23013506086045252, + "learning_rate": 2.0151303842812472e-06, + "loss": 0.1651, + "num_tokens": 4393185793.0, + "step": 5760 + }, + { + "epoch": 7.86031600396872, + "grad_norm": 0.2423108466517439, + "learning_rate": 2.0148436377253393e-06, + "loss": 0.1676, + "num_tokens": 4393888622.0, + "step": 5761 + }, + { + "epoch": 7.8616815849274, + "grad_norm": 0.21993651990189325, + "learning_rate": 2.014559632151523e-06, + "loss": 0.1651, + "num_tokens": 4394729601.0, + "step": 5762 + }, + { + "epoch": 7.863047165886081, + "grad_norm": 0.23443289705509907, + "learning_rate": 2.0142783676464377e-06, + "loss": 0.1556, + "num_tokens": 4395423553.0, + "step": 5763 + }, + { + "epoch": 7.864412746844761, + "grad_norm": 0.22060208411532642, + "learning_rate": 2.0139998442958834e-06, + "loss": 0.1645, + "num_tokens": 4396147564.0, + "step": 5764 + }, + { + "epoch": 7.865778327803442, + "grad_norm": 0.2270216831338646, + "learning_rate": 2.0137240621848257e-06, + "loss": 0.1617, + "num_tokens": 4396908754.0, + "step": 5765 + }, + { + "epoch": 7.867143908762122, + "grad_norm": 0.23110200613689, + "learning_rate": 2.013451021397394e-06, + "loss": 0.1617, + "num_tokens": 4397668319.0, + "step": 5766 + }, + { + "epoch": 7.868509489720803, + "grad_norm": 0.22949546828353412, + "learning_rate": 2.013180722016882e-06, + "loss": 0.1664, + "num_tokens": 4398412988.0, + "step": 5767 + }, + { + "epoch": 7.869875070679483, + "grad_norm": 0.22913739954759696, + "learning_rate": 2.0129131641257456e-06, + "loss": 0.1699, + "num_tokens": 4399222157.0, + "step": 5768 + }, + { + "epoch": 7.871240651638164, + "grad_norm": 0.2369859196539888, + "learning_rate": 2.012648347805605e-06, + "loss": 0.1643, + "num_tokens": 4399945292.0, + "step": 5769 + }, + { + "epoch": 7.872606232596844, + "grad_norm": 0.23879210479187435, + "learning_rate": 2.012386273137244e-06, + "loss": 0.1613, + "num_tokens": 4400650315.0, + "step": 5770 + }, + { + "epoch": 7.873971813555524, + "grad_norm": 0.23240493322999725, + "learning_rate": 2.012126940200611e-06, + "loss": 0.1679, + "num_tokens": 4401418763.0, + "step": 5771 + }, + { + "epoch": 7.875337394514205, + "grad_norm": 0.23463897466912376, + "learning_rate": 2.0118703490748164e-06, + "loss": 0.1609, + "num_tokens": 4402151992.0, + "step": 5772 + }, + { + "epoch": 7.8767029754728854, + "grad_norm": 0.23305948495299283, + "learning_rate": 2.011616499838136e-06, + "loss": 0.1646, + "num_tokens": 4402951899.0, + "step": 5773 + }, + { + "epoch": 7.8780685564315664, + "grad_norm": 0.22931067591527607, + "learning_rate": 2.0113653925680078e-06, + "loss": 0.1612, + "num_tokens": 4403722148.0, + "step": 5774 + }, + { + "epoch": 7.879434137390247, + "grad_norm": 0.2310737475518446, + "learning_rate": 2.0111170273410334e-06, + "loss": 0.1629, + "num_tokens": 4404501409.0, + "step": 5775 + }, + { + "epoch": 7.880799718348928, + "grad_norm": 0.23150549262900103, + "learning_rate": 2.0108714042329787e-06, + "loss": 0.1666, + "num_tokens": 4405223566.0, + "step": 5776 + }, + { + "epoch": 7.882165299307608, + "grad_norm": 0.24904634431804173, + "learning_rate": 2.0106285233187716e-06, + "loss": 0.169, + "num_tokens": 4406055533.0, + "step": 5777 + }, + { + "epoch": 7.883530880266289, + "grad_norm": 0.22853614951874485, + "learning_rate": 2.0103883846725065e-06, + "loss": 0.1566, + "num_tokens": 4406827390.0, + "step": 5778 + }, + { + "epoch": 7.884896461224969, + "grad_norm": 0.23626205255810287, + "learning_rate": 2.0101509883674377e-06, + "loss": 0.1739, + "num_tokens": 4407627513.0, + "step": 5779 + }, + { + "epoch": 7.886262042183649, + "grad_norm": 0.2259126194016761, + "learning_rate": 2.0099163344759857e-06, + "loss": 0.1676, + "num_tokens": 4408432536.0, + "step": 5780 + }, + { + "epoch": 7.88762762314233, + "grad_norm": 0.22755757564161358, + "learning_rate": 2.009684423069732e-06, + "loss": 0.1648, + "num_tokens": 4409200492.0, + "step": 5781 + }, + { + "epoch": 7.88899320410101, + "grad_norm": 0.22455917801932146, + "learning_rate": 2.0094552542194236e-06, + "loss": 0.1643, + "num_tokens": 4410004029.0, + "step": 5782 + }, + { + "epoch": 7.890358785059691, + "grad_norm": 0.2279879572280495, + "learning_rate": 2.0092288279949696e-06, + "loss": 0.1602, + "num_tokens": 4410748180.0, + "step": 5783 + }, + { + "epoch": 7.891724366018371, + "grad_norm": 0.22776380631196652, + "learning_rate": 2.009005144465443e-06, + "loss": 0.1584, + "num_tokens": 4411509424.0, + "step": 5784 + }, + { + "epoch": 7.893089946977052, + "grad_norm": 0.22992693555650937, + "learning_rate": 2.0087842036990806e-06, + "loss": 0.1669, + "num_tokens": 4412307429.0, + "step": 5785 + }, + { + "epoch": 7.894455527935732, + "grad_norm": 0.2343023393780146, + "learning_rate": 2.00856600576328e-06, + "loss": 0.1647, + "num_tokens": 4413097684.0, + "step": 5786 + }, + { + "epoch": 7.895821108894413, + "grad_norm": 0.2206469111461284, + "learning_rate": 2.008350550724606e-06, + "loss": 0.163, + "num_tokens": 4413939799.0, + "step": 5787 + }, + { + "epoch": 7.897186689853093, + "grad_norm": 0.2273835199921538, + "learning_rate": 2.0081378386487837e-06, + "loss": 0.1567, + "num_tokens": 4414626930.0, + "step": 5788 + }, + { + "epoch": 7.8985522708117735, + "grad_norm": 0.22444434641868596, + "learning_rate": 2.0079278696007023e-06, + "loss": 0.1666, + "num_tokens": 4415347297.0, + "step": 5789 + }, + { + "epoch": 7.8999178517704545, + "grad_norm": 0.2292479245554363, + "learning_rate": 2.0077206436444145e-06, + "loss": 0.163, + "num_tokens": 4416152337.0, + "step": 5790 + }, + { + "epoch": 7.901283432729135, + "grad_norm": 0.22513725162752082, + "learning_rate": 2.007516160843135e-06, + "loss": 0.1668, + "num_tokens": 4416914935.0, + "step": 5791 + }, + { + "epoch": 7.902649013687816, + "grad_norm": 0.23789816066470512, + "learning_rate": 2.007314421259245e-06, + "loss": 0.1663, + "num_tokens": 4417712092.0, + "step": 5792 + }, + { + "epoch": 7.904014594646496, + "grad_norm": 0.23557190487652216, + "learning_rate": 2.0071154249542837e-06, + "loss": 0.164, + "num_tokens": 4418459100.0, + "step": 5793 + }, + { + "epoch": 7.905380175605177, + "grad_norm": 0.22926588464163575, + "learning_rate": 2.006919171988958e-06, + "loss": 0.1711, + "num_tokens": 4419199699.0, + "step": 5794 + }, + { + "epoch": 7.906745756563857, + "grad_norm": 0.24726729400574046, + "learning_rate": 2.0067256624231356e-06, + "loss": 0.1628, + "num_tokens": 4419940643.0, + "step": 5795 + }, + { + "epoch": 7.908111337522538, + "grad_norm": 0.24011560569304177, + "learning_rate": 2.0065348963158477e-06, + "loss": 0.1577, + "num_tokens": 4420648349.0, + "step": 5796 + }, + { + "epoch": 7.909476918481218, + "grad_norm": 0.2214240962632836, + "learning_rate": 2.0063468737252896e-06, + "loss": 0.1539, + "num_tokens": 4421479249.0, + "step": 5797 + }, + { + "epoch": 7.910842499439898, + "grad_norm": 0.2534549203367501, + "learning_rate": 2.0061615947088177e-06, + "loss": 0.1707, + "num_tokens": 4422236236.0, + "step": 5798 + }, + { + "epoch": 7.912208080398579, + "grad_norm": 0.21514106150071363, + "learning_rate": 2.0059790593229532e-06, + "loss": 0.1633, + "num_tokens": 4423086163.0, + "step": 5799 + }, + { + "epoch": 7.913573661357259, + "grad_norm": 0.22136366756049505, + "learning_rate": 2.005799267623381e-06, + "loss": 0.1618, + "num_tokens": 4423859762.0, + "step": 5800 + }, + { + "epoch": 7.91493924231594, + "grad_norm": 0.24422040870992315, + "learning_rate": 2.005622219664945e-06, + "loss": 0.1589, + "num_tokens": 4424599649.0, + "step": 5801 + }, + { + "epoch": 7.91630482327462, + "grad_norm": 0.2285332856698791, + "learning_rate": 2.005447915501657e-06, + "loss": 0.1599, + "num_tokens": 4425355237.0, + "step": 5802 + }, + { + "epoch": 7.917670404233301, + "grad_norm": 0.2274700619622788, + "learning_rate": 2.00527635518669e-06, + "loss": 0.1589, + "num_tokens": 4426050404.0, + "step": 5803 + }, + { + "epoch": 7.919035985191981, + "grad_norm": 0.22806067447867245, + "learning_rate": 2.005107538772377e-06, + "loss": 0.1614, + "num_tokens": 4426810280.0, + "step": 5804 + }, + { + "epoch": 7.920401566150662, + "grad_norm": 0.2204717274411671, + "learning_rate": 2.004941466310219e-06, + "loss": 0.1631, + "num_tokens": 4427622329.0, + "step": 5805 + }, + { + "epoch": 7.9217671471093425, + "grad_norm": 0.23261722589902298, + "learning_rate": 2.004778137850877e-06, + "loss": 0.1613, + "num_tokens": 4428385467.0, + "step": 5806 + }, + { + "epoch": 7.923132728068023, + "grad_norm": 0.22765915015892715, + "learning_rate": 2.0046175534441746e-06, + "loss": 0.1653, + "num_tokens": 4429176875.0, + "step": 5807 + }, + { + "epoch": 7.924498309026704, + "grad_norm": 0.24113808654934155, + "learning_rate": 2.0044597131391005e-06, + "loss": 0.1601, + "num_tokens": 4429877116.0, + "step": 5808 + }, + { + "epoch": 7.925863889985384, + "grad_norm": 0.23003361406376932, + "learning_rate": 2.0043046169838036e-06, + "loss": 0.166, + "num_tokens": 4430638504.0, + "step": 5809 + }, + { + "epoch": 7.927229470944065, + "grad_norm": 0.23594396696863296, + "learning_rate": 2.0041522650255984e-06, + "loss": 0.1751, + "num_tokens": 4431409379.0, + "step": 5810 + }, + { + "epoch": 7.928595051902745, + "grad_norm": 0.22624894422767877, + "learning_rate": 2.0040026573109598e-06, + "loss": 0.1668, + "num_tokens": 4432204821.0, + "step": 5811 + }, + { + "epoch": 7.929960632861426, + "grad_norm": 0.2323060424036415, + "learning_rate": 2.0038557938855255e-06, + "loss": 0.1745, + "num_tokens": 4432994112.0, + "step": 5812 + }, + { + "epoch": 7.931326213820106, + "grad_norm": 0.2369827939123705, + "learning_rate": 2.0037116747941e-06, + "loss": 0.1606, + "num_tokens": 4433710631.0, + "step": 5813 + }, + { + "epoch": 7.932691794778787, + "grad_norm": 0.22812058939469612, + "learning_rate": 2.003570300080647e-06, + "loss": 0.1617, + "num_tokens": 4434489370.0, + "step": 5814 + }, + { + "epoch": 7.934057375737467, + "grad_norm": 0.22354799730178548, + "learning_rate": 2.0034316697882926e-06, + "loss": 0.1681, + "num_tokens": 4435291244.0, + "step": 5815 + }, + { + "epoch": 7.935422956696147, + "grad_norm": 0.22645349374305923, + "learning_rate": 2.0032957839593266e-06, + "loss": 0.1694, + "num_tokens": 4436063853.0, + "step": 5816 + }, + { + "epoch": 7.936788537654828, + "grad_norm": 0.23311485342332058, + "learning_rate": 2.0031626426352035e-06, + "loss": 0.166, + "num_tokens": 4436795008.0, + "step": 5817 + }, + { + "epoch": 7.938154118613508, + "grad_norm": 0.23052284802768483, + "learning_rate": 2.003032245856538e-06, + "loss": 0.162, + "num_tokens": 4437604560.0, + "step": 5818 + }, + { + "epoch": 7.939519699572189, + "grad_norm": 0.23153608092506478, + "learning_rate": 2.002904593663109e-06, + "loss": 0.1617, + "num_tokens": 4438322936.0, + "step": 5819 + }, + { + "epoch": 7.940885280530869, + "grad_norm": 0.22916282215184833, + "learning_rate": 2.0027796860938575e-06, + "loss": 0.1686, + "num_tokens": 4439182873.0, + "step": 5820 + }, + { + "epoch": 7.94225086148955, + "grad_norm": 0.22583611489705843, + "learning_rate": 2.0026575231868873e-06, + "loss": 0.1627, + "num_tokens": 4439938860.0, + "step": 5821 + }, + { + "epoch": 7.9436164424482305, + "grad_norm": 0.22713970553575472, + "learning_rate": 2.0025381049794645e-06, + "loss": 0.1614, + "num_tokens": 4440665271.0, + "step": 5822 + }, + { + "epoch": 7.9449820234069115, + "grad_norm": 0.22368015856935375, + "learning_rate": 2.0024214315080195e-06, + "loss": 0.163, + "num_tokens": 4441501480.0, + "step": 5823 + }, + { + "epoch": 7.946347604365592, + "grad_norm": 0.22659122828025474, + "learning_rate": 2.0023075028081433e-06, + "loss": 0.1724, + "num_tokens": 4442324090.0, + "step": 5824 + }, + { + "epoch": 7.947713185324272, + "grad_norm": 0.22882080487759981, + "learning_rate": 2.002196318914591e-06, + "loss": 0.166, + "num_tokens": 4443171301.0, + "step": 5825 + }, + { + "epoch": 7.949078766282953, + "grad_norm": 0.23451597926457088, + "learning_rate": 2.00208787986128e-06, + "loss": 0.163, + "num_tokens": 4443929898.0, + "step": 5826 + }, + { + "epoch": 7.950444347241633, + "grad_norm": 0.24043210321592215, + "learning_rate": 2.001982185681291e-06, + "loss": 0.1629, + "num_tokens": 4444648349.0, + "step": 5827 + }, + { + "epoch": 7.951809928200314, + "grad_norm": 0.2529564157296662, + "learning_rate": 2.001879236406866e-06, + "loss": 0.1695, + "num_tokens": 4445342332.0, + "step": 5828 + }, + { + "epoch": 7.953175509158994, + "grad_norm": 0.2281000859942969, + "learning_rate": 2.0017790320694103e-06, + "loss": 0.1571, + "num_tokens": 4446061984.0, + "step": 5829 + }, + { + "epoch": 7.954541090117675, + "grad_norm": 0.2317148264340576, + "learning_rate": 2.001681572699492e-06, + "loss": 0.1693, + "num_tokens": 4446847852.0, + "step": 5830 + }, + { + "epoch": 7.955906671076355, + "grad_norm": 0.23600375494856554, + "learning_rate": 2.001586858326842e-06, + "loss": 0.172, + "num_tokens": 4447628621.0, + "step": 5831 + }, + { + "epoch": 7.957272252035036, + "grad_norm": 0.2438173260041368, + "learning_rate": 2.0014948889803537e-06, + "loss": 0.156, + "num_tokens": 4448297652.0, + "step": 5832 + }, + { + "epoch": 7.958637832993716, + "grad_norm": 0.23376242238024142, + "learning_rate": 2.001405664688082e-06, + "loss": 0.161, + "num_tokens": 4449126333.0, + "step": 5833 + }, + { + "epoch": 7.960003413952396, + "grad_norm": 0.22917588956974264, + "learning_rate": 2.0013191854772457e-06, + "loss": 0.1599, + "num_tokens": 4449876491.0, + "step": 5834 + }, + { + "epoch": 7.961368994911077, + "grad_norm": 0.23614418517987648, + "learning_rate": 2.0012354513742276e-06, + "loss": 0.1618, + "num_tokens": 4450570027.0, + "step": 5835 + }, + { + "epoch": 7.962734575869757, + "grad_norm": 0.23976807978458395, + "learning_rate": 2.0011544624045685e-06, + "loss": 0.1655, + "num_tokens": 4451297840.0, + "step": 5836 + }, + { + "epoch": 7.964100156828438, + "grad_norm": 0.25022630745065805, + "learning_rate": 2.001076218592977e-06, + "loss": 0.1688, + "num_tokens": 4452001197.0, + "step": 5837 + }, + { + "epoch": 7.9654657377871185, + "grad_norm": 0.23063561912550065, + "learning_rate": 2.001000719963321e-06, + "loss": 0.1599, + "num_tokens": 4452751462.0, + "step": 5838 + }, + { + "epoch": 7.9668313187457995, + "grad_norm": 0.23641585980107854, + "learning_rate": 2.000927966538631e-06, + "loss": 0.1643, + "num_tokens": 4453564004.0, + "step": 5839 + }, + { + "epoch": 7.96819689970448, + "grad_norm": 0.22947367409534156, + "learning_rate": 2.000857958341102e-06, + "loss": 0.1606, + "num_tokens": 4454328075.0, + "step": 5840 + }, + { + "epoch": 7.969562480663161, + "grad_norm": 0.24899531725306326, + "learning_rate": 2.00079069539209e-06, + "loss": 0.1564, + "num_tokens": 4455088262.0, + "step": 5841 + }, + { + "epoch": 7.970928061621841, + "grad_norm": 0.22776711538512154, + "learning_rate": 2.0007261777121147e-06, + "loss": 0.164, + "num_tokens": 4455918584.0, + "step": 5842 + }, + { + "epoch": 7.972293642580521, + "grad_norm": 0.28848476696571973, + "learning_rate": 2.0006644053208566e-06, + "loss": 0.1633, + "num_tokens": 4456703635.0, + "step": 5843 + }, + { + "epoch": 7.973659223539202, + "grad_norm": 0.28223751443149914, + "learning_rate": 2.000605378237161e-06, + "loss": 0.1701, + "num_tokens": 4457505976.0, + "step": 5844 + }, + { + "epoch": 7.975024804497882, + "grad_norm": 0.23072968678468886, + "learning_rate": 2.000549096479034e-06, + "loss": 0.1667, + "num_tokens": 4458334172.0, + "step": 5845 + }, + { + "epoch": 7.976390385456563, + "grad_norm": 0.2377879780340767, + "learning_rate": 2.0004955600636437e-06, + "loss": 0.1674, + "num_tokens": 4459129273.0, + "step": 5846 + }, + { + "epoch": 7.977755966415243, + "grad_norm": 0.2304807991146126, + "learning_rate": 2.000444769007323e-06, + "loss": 0.1666, + "num_tokens": 4459890208.0, + "step": 5847 + }, + { + "epoch": 7.979121547373924, + "grad_norm": 0.23782296374662132, + "learning_rate": 2.000396723325566e-06, + "loss": 0.164, + "num_tokens": 4460602020.0, + "step": 5848 + }, + { + "epoch": 7.980487128332604, + "grad_norm": 0.248930851723284, + "learning_rate": 2.000351423033029e-06, + "loss": 0.1556, + "num_tokens": 4461314684.0, + "step": 5849 + }, + { + "epoch": 7.981852709291285, + "grad_norm": 0.22867970105483598, + "learning_rate": 2.0003088681435307e-06, + "loss": 0.1642, + "num_tokens": 4462016477.0, + "step": 5850 + }, + { + "epoch": 7.983218290249965, + "grad_norm": 0.21992654659741154, + "learning_rate": 2.0002690586700542e-06, + "loss": 0.159, + "num_tokens": 4462791454.0, + "step": 5851 + }, + { + "epoch": 7.984583871208645, + "grad_norm": 0.22730552138059296, + "learning_rate": 2.000231994624741e-06, + "loss": 0.1615, + "num_tokens": 4463584158.0, + "step": 5852 + }, + { + "epoch": 7.985949452167326, + "grad_norm": 0.22994005126658953, + "learning_rate": 2.000197676018901e-06, + "loss": 0.1682, + "num_tokens": 4464412790.0, + "step": 5853 + }, + { + "epoch": 7.9873150331260065, + "grad_norm": 0.2353155350076908, + "learning_rate": 2.000166102863001e-06, + "loss": 0.1571, + "num_tokens": 4465099606.0, + "step": 5854 + }, + { + "epoch": 7.9886806140846875, + "grad_norm": 0.23455558669834398, + "learning_rate": 2.000137275166673e-06, + "loss": 0.1614, + "num_tokens": 4465810649.0, + "step": 5855 + }, + { + "epoch": 7.990046195043368, + "grad_norm": 0.2316126251583514, + "learning_rate": 2.000111192938713e-06, + "loss": 0.1584, + "num_tokens": 4466574254.0, + "step": 5856 + }, + { + "epoch": 7.991411776002049, + "grad_norm": 0.25012097239487935, + "learning_rate": 2.0000878561870745e-06, + "loss": 0.1685, + "num_tokens": 4467355861.0, + "step": 5857 + }, + { + "epoch": 7.992777356960729, + "grad_norm": 0.22797698087241033, + "learning_rate": 2.0000672649188782e-06, + "loss": 0.1671, + "num_tokens": 4468156350.0, + "step": 5858 + }, + { + "epoch": 7.99414293791941, + "grad_norm": 0.22354187453919552, + "learning_rate": 2.0000494191404064e-06, + "loss": 0.1636, + "num_tokens": 4468921301.0, + "step": 5859 + }, + { + "epoch": 7.99550851887809, + "grad_norm": 0.23043010466861574, + "learning_rate": 2.0000343188571013e-06, + "loss": 0.1703, + "num_tokens": 4469761157.0, + "step": 5860 + }, + { + "epoch": 7.99687409983677, + "grad_norm": 0.2402547414806548, + "learning_rate": 2.0000219640735703e-06, + "loss": 0.1635, + "num_tokens": 4470542645.0, + "step": 5861 + }, + { + "epoch": 7.998239680795451, + "grad_norm": 0.21872558371132542, + "learning_rate": 2.0000123547935814e-06, + "loss": 0.1575, + "num_tokens": 4471318562.0, + "step": 5862 + }, + { + "epoch": 7.999605261754131, + "grad_norm": 0.22836278980538602, + "learning_rate": 2.000005491020068e-06, + "loss": 0.167, + "num_tokens": 4472100076.0, + "step": 5863 + }, + { + "epoch": 8.0, + "grad_norm": 0.22836278980538602, + "learning_rate": 2.0000013727551216e-06, + "loss": 0.1635, + "num_tokens": 4472337088.0, + "step": 5864 + }, + { + "epoch": 8.0, + "step": 5864, + "total_flos": 1.6577411091894632e+19, + "train_loss": 0.26399336885381364, + "train_runtime": 845440.9049, + "train_samples_per_second": 0.887, + "train_steps_per_second": 0.007 + } + ], + "logging_steps": 1, + "max_steps": 5864, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.6577411091894632e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}