{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 255, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011764705882352941, "grad_norm": 3.6131326491907996, "learning_rate": 9.090909090909091e-07, "loss": 0.7929452061653137, "num_input_tokens_seen": 0, "step": 1 }, { "epoch": 0.023529411764705882, "grad_norm": 3.9380904690801763, "learning_rate": 1.8181818181818183e-06, "loss": 0.8463644981384277, "num_input_tokens_seen": 0, "step": 2 }, { "epoch": 0.03529411764705882, "grad_norm": 3.5075707953425375, "learning_rate": 2.7272727272727272e-06, "loss": 0.7615697383880615, "num_input_tokens_seen": 0, "step": 3 }, { "epoch": 0.047058823529411764, "grad_norm": 3.568991966465913, "learning_rate": 3.6363636363636366e-06, "loss": 0.7781215906143188, "num_input_tokens_seen": 0, "step": 4 }, { "epoch": 0.058823529411764705, "grad_norm": 3.6216549878210613, "learning_rate": 4.5454545454545455e-06, "loss": 0.8030836582183838, "num_input_tokens_seen": 0, "step": 5 }, { "epoch": 0.07058823529411765, "grad_norm": 3.0788368926498566, "learning_rate": 5.4545454545454545e-06, "loss": 0.7026812434196472, "num_input_tokens_seen": 0, "step": 6 }, { "epoch": 0.08235294117647059, "grad_norm": 3.1470952265114955, "learning_rate": 6.363636363636364e-06, "loss": 0.7064052224159241, "num_input_tokens_seen": 0, "step": 7 }, { "epoch": 0.09411764705882353, "grad_norm": 2.1537924186137105, "learning_rate": 7.272727272727273e-06, "loss": 0.5964475274085999, "num_input_tokens_seen": 0, "step": 8 }, { "epoch": 0.10588235294117647, "grad_norm": 2.1368649029511326, "learning_rate": 8.181818181818183e-06, "loss": 0.6179602146148682, "num_input_tokens_seen": 0, "step": 9 }, { "epoch": 0.11764705882352941, "grad_norm": 1.981711432431228, "learning_rate": 9.090909090909091e-06, "loss": 0.5555359125137329, "num_input_tokens_seen": 0, "step": 10 }, { "epoch": 0.12941176470588237, "grad_norm": 1.559928530743653, "learning_rate": 1e-05, "loss": 0.5162748694419861, "num_input_tokens_seen": 0, "step": 11 }, { "epoch": 0.1411764705882353, "grad_norm": 1.4191567801273215, "learning_rate": 1.0909090909090909e-05, "loss": 0.4931896924972534, "num_input_tokens_seen": 0, "step": 12 }, { "epoch": 0.15294117647058825, "grad_norm": 1.2550232555964334, "learning_rate": 1.181818181818182e-05, "loss": 0.5178971886634827, "num_input_tokens_seen": 0, "step": 13 }, { "epoch": 0.16470588235294117, "grad_norm": 1.5353572118512469, "learning_rate": 1.2727272727272728e-05, "loss": 0.4872450530529022, "num_input_tokens_seen": 0, "step": 14 }, { "epoch": 0.17647058823529413, "grad_norm": 1.3077407975869617, "learning_rate": 1.3636363636363637e-05, "loss": 0.4353964030742645, "num_input_tokens_seen": 0, "step": 15 }, { "epoch": 0.18823529411764706, "grad_norm": 1.2379322286899677, "learning_rate": 1.4545454545454546e-05, "loss": 0.4468710124492645, "num_input_tokens_seen": 0, "step": 16 }, { "epoch": 0.2, "grad_norm": 0.9822538115083431, "learning_rate": 1.5454545454545454e-05, "loss": 0.4612148106098175, "num_input_tokens_seen": 0, "step": 17 }, { "epoch": 0.21176470588235294, "grad_norm": 1.019398009956526, "learning_rate": 1.6363636363636366e-05, "loss": 0.4416177570819855, "num_input_tokens_seen": 0, "step": 18 }, { "epoch": 0.2235294117647059, "grad_norm": 0.6811851291872145, "learning_rate": 1.7272727272727274e-05, "loss": 0.4400823712348938, "num_input_tokens_seen": 0, "step": 19 }, { "epoch": 0.23529411764705882, "grad_norm": 0.5706281411245361, "learning_rate": 1.8181818181818182e-05, "loss": 0.407284140586853, "num_input_tokens_seen": 0, "step": 20 }, { "epoch": 0.24705882352941178, "grad_norm": 0.5144726641840578, "learning_rate": 1.9090909090909094e-05, "loss": 0.40131646394729614, "num_input_tokens_seen": 0, "step": 21 }, { "epoch": 0.25882352941176473, "grad_norm": 0.5271151443718087, "learning_rate": 2e-05, "loss": 0.3690889775753021, "num_input_tokens_seen": 0, "step": 22 }, { "epoch": 0.27058823529411763, "grad_norm": 0.46867880268180473, "learning_rate": 1.999969615124717e-05, "loss": 0.38862237334251404, "num_input_tokens_seen": 0, "step": 23 }, { "epoch": 0.2823529411764706, "grad_norm": 0.44956811034445643, "learning_rate": 1.9998784623453477e-05, "loss": 0.38012465834617615, "num_input_tokens_seen": 0, "step": 24 }, { "epoch": 0.29411764705882354, "grad_norm": 0.3973152538462978, "learning_rate": 1.9997265472012247e-05, "loss": 0.3671787679195404, "num_input_tokens_seen": 0, "step": 25 }, { "epoch": 0.3058823529411765, "grad_norm": 0.44092842775695007, "learning_rate": 1.999513878924193e-05, "loss": 0.40712660551071167, "num_input_tokens_seen": 0, "step": 26 }, { "epoch": 0.3176470588235294, "grad_norm": 0.48371948306233614, "learning_rate": 1.9992404704380513e-05, "loss": 0.37348443269729614, "num_input_tokens_seen": 0, "step": 27 }, { "epoch": 0.32941176470588235, "grad_norm": 0.3835238779084052, "learning_rate": 1.9989063383577644e-05, "loss": 0.36719316244125366, "num_input_tokens_seen": 0, "step": 28 }, { "epoch": 0.3411764705882353, "grad_norm": 0.4114683262445794, "learning_rate": 1.9985115029884556e-05, "loss": 0.3744957447052002, "num_input_tokens_seen": 0, "step": 29 }, { "epoch": 0.35294117647058826, "grad_norm": 0.3926833772366512, "learning_rate": 1.9980559883241723e-05, "loss": 0.36837196350097656, "num_input_tokens_seen": 0, "step": 30 }, { "epoch": 0.36470588235294116, "grad_norm": 0.34510721456392296, "learning_rate": 1.9975398220464268e-05, "loss": 0.35771483182907104, "num_input_tokens_seen": 0, "step": 31 }, { "epoch": 0.3764705882352941, "grad_norm": 0.2906065104717026, "learning_rate": 1.996963035522515e-05, "loss": 0.3718343675136566, "num_input_tokens_seen": 0, "step": 32 }, { "epoch": 0.38823529411764707, "grad_norm": 0.28680989428590703, "learning_rate": 1.99632566380361e-05, "loss": 0.35502949357032776, "num_input_tokens_seen": 0, "step": 33 }, { "epoch": 0.4, "grad_norm": 0.36479584489715183, "learning_rate": 1.995627745622632e-05, "loss": 0.3561074733734131, "num_input_tokens_seen": 0, "step": 34 }, { "epoch": 0.4117647058823529, "grad_norm": 0.3233890688969203, "learning_rate": 1.994869323391895e-05, "loss": 0.36625605821609497, "num_input_tokens_seen": 0, "step": 35 }, { "epoch": 0.4235294117647059, "grad_norm": 0.3014081868844546, "learning_rate": 1.9940504432005293e-05, "loss": 0.32261648774147034, "num_input_tokens_seen": 0, "step": 36 }, { "epoch": 0.43529411764705883, "grad_norm": 0.29988977041893033, "learning_rate": 1.993171154811679e-05, "loss": 0.36725232005119324, "num_input_tokens_seen": 0, "step": 37 }, { "epoch": 0.4470588235294118, "grad_norm": 0.2843556547007244, "learning_rate": 1.992231511659481e-05, "loss": 0.3372398316860199, "num_input_tokens_seen": 0, "step": 38 }, { "epoch": 0.4588235294117647, "grad_norm": 0.25146289166175934, "learning_rate": 1.9912315708458144e-05, "loss": 0.35911282896995544, "num_input_tokens_seen": 0, "step": 39 }, { "epoch": 0.47058823529411764, "grad_norm": 0.25109365889274177, "learning_rate": 1.9901713931368333e-05, "loss": 0.35379254817962646, "num_input_tokens_seen": 0, "step": 40 }, { "epoch": 0.4823529411764706, "grad_norm": 0.25470949144259586, "learning_rate": 1.989051042959273e-05, "loss": 0.34498846530914307, "num_input_tokens_seen": 0, "step": 41 }, { "epoch": 0.49411764705882355, "grad_norm": 0.2603263992351511, "learning_rate": 1.9878705883965342e-05, "loss": 0.346971720457077, "num_input_tokens_seen": 0, "step": 42 }, { "epoch": 0.5058823529411764, "grad_norm": 0.2837708452421679, "learning_rate": 1.986630101184546e-05, "loss": 0.3518391251564026, "num_input_tokens_seen": 0, "step": 43 }, { "epoch": 0.5176470588235295, "grad_norm": 0.2690921713644025, "learning_rate": 1.9853296567074075e-05, "loss": 0.3417142331600189, "num_input_tokens_seen": 0, "step": 44 }, { "epoch": 0.5294117647058824, "grad_norm": 0.27838893675816295, "learning_rate": 1.983969333992804e-05, "loss": 0.33975788950920105, "num_input_tokens_seen": 0, "step": 45 }, { "epoch": 0.5411764705882353, "grad_norm": 0.2500924041093808, "learning_rate": 1.982549215707209e-05, "loss": 0.3427805006504059, "num_input_tokens_seen": 0, "step": 46 }, { "epoch": 0.5529411764705883, "grad_norm": 0.2515785105886048, "learning_rate": 1.9810693881508548e-05, "loss": 0.34949395060539246, "num_input_tokens_seen": 0, "step": 47 }, { "epoch": 0.5647058823529412, "grad_norm": 0.29781103489548, "learning_rate": 1.9795299412524948e-05, "loss": 0.34314972162246704, "num_input_tokens_seen": 0, "step": 48 }, { "epoch": 0.5764705882352941, "grad_norm": 0.2938887318454496, "learning_rate": 1.9779309685639317e-05, "loss": 0.3414318263530731, "num_input_tokens_seen": 0, "step": 49 }, { "epoch": 0.5882352941176471, "grad_norm": 0.2895540362722232, "learning_rate": 1.9762725672543372e-05, "loss": 0.3192686140537262, "num_input_tokens_seen": 0, "step": 50 }, { "epoch": 0.6, "grad_norm": 0.23066275708054598, "learning_rate": 1.9745548381043454e-05, "loss": 0.3385634422302246, "num_input_tokens_seen": 0, "step": 51 }, { "epoch": 0.611764705882353, "grad_norm": 0.280246896213228, "learning_rate": 1.9727778854999283e-05, "loss": 0.35149312019348145, "num_input_tokens_seen": 0, "step": 52 }, { "epoch": 0.6235294117647059, "grad_norm": 0.27445702417204193, "learning_rate": 1.9709418174260523e-05, "loss": 0.3358836770057678, "num_input_tokens_seen": 0, "step": 53 }, { "epoch": 0.6352941176470588, "grad_norm": 0.27123307051788814, "learning_rate": 1.969046745460116e-05, "loss": 0.35038888454437256, "num_input_tokens_seen": 0, "step": 54 }, { "epoch": 0.6470588235294118, "grad_norm": 0.3022409414703534, "learning_rate": 1.9670927847651707e-05, "loss": 0.3619537651538849, "num_input_tokens_seen": 0, "step": 55 }, { "epoch": 0.6588235294117647, "grad_norm": 0.28043980426354626, "learning_rate": 1.9650800540829204e-05, "loss": 0.334235817193985, "num_input_tokens_seen": 0, "step": 56 }, { "epoch": 0.6705882352941176, "grad_norm": 0.2608407829948446, "learning_rate": 1.963008675726506e-05, "loss": 0.3367481827735901, "num_input_tokens_seen": 0, "step": 57 }, { "epoch": 0.6823529411764706, "grad_norm": 0.28536414145460753, "learning_rate": 1.9608787755730746e-05, "loss": 0.3296854496002197, "num_input_tokens_seen": 0, "step": 58 }, { "epoch": 0.6941176470588235, "grad_norm": 0.27980621624734936, "learning_rate": 1.958690483056126e-05, "loss": 0.32561179995536804, "num_input_tokens_seen": 0, "step": 59 }, { "epoch": 0.7058823529411765, "grad_norm": 0.2424680758848498, "learning_rate": 1.9564439311576515e-05, "loss": 0.33346784114837646, "num_input_tokens_seen": 0, "step": 60 }, { "epoch": 0.7176470588235294, "grad_norm": 0.24779814083785615, "learning_rate": 1.954139256400049e-05, "loss": 0.34621721506118774, "num_input_tokens_seen": 0, "step": 61 }, { "epoch": 0.7294117647058823, "grad_norm": 0.28660266256207545, "learning_rate": 1.951776598837829e-05, "loss": 0.31782716512680054, "num_input_tokens_seen": 0, "step": 62 }, { "epoch": 0.7411764705882353, "grad_norm": 0.2628734068408129, "learning_rate": 1.9493561020491024e-05, "loss": 0.3253316283226013, "num_input_tokens_seen": 0, "step": 63 }, { "epoch": 0.7529411764705882, "grad_norm": 0.253172171843236, "learning_rate": 1.9468779131268553e-05, "loss": 0.32543760538101196, "num_input_tokens_seen": 0, "step": 64 }, { "epoch": 0.7647058823529411, "grad_norm": 0.28753705660744233, "learning_rate": 1.9443421826700096e-05, "loss": 0.32660526037216187, "num_input_tokens_seen": 0, "step": 65 }, { "epoch": 0.7764705882352941, "grad_norm": 0.2550674488664895, "learning_rate": 1.9417490647742738e-05, "loss": 0.29762235283851624, "num_input_tokens_seen": 0, "step": 66 }, { "epoch": 0.788235294117647, "grad_norm": 0.2387936654216341, "learning_rate": 1.9390987170227746e-05, "loss": 0.34908509254455566, "num_input_tokens_seen": 0, "step": 67 }, { "epoch": 0.8, "grad_norm": 0.28736279751275334, "learning_rate": 1.9363913004764847e-05, "loss": 0.3289881944656372, "num_input_tokens_seen": 0, "step": 68 }, { "epoch": 0.8117647058823529, "grad_norm": 0.2857068101908039, "learning_rate": 1.9336269796644314e-05, "loss": 0.316879540681839, "num_input_tokens_seen": 0, "step": 69 }, { "epoch": 0.8235294117647058, "grad_norm": 0.262248761213064, "learning_rate": 1.9308059225737015e-05, "loss": 0.3085065484046936, "num_input_tokens_seen": 0, "step": 70 }, { "epoch": 0.8352941176470589, "grad_norm": 0.27771726352284015, "learning_rate": 1.9279283006392304e-05, "loss": 0.3186359405517578, "num_input_tokens_seen": 0, "step": 71 }, { "epoch": 0.8470588235294118, "grad_norm": 0.31802345881089383, "learning_rate": 1.924994288733386e-05, "loss": 0.329565167427063, "num_input_tokens_seen": 0, "step": 72 }, { "epoch": 0.8588235294117647, "grad_norm": 0.28385852442224846, "learning_rate": 1.9220040651553388e-05, "loss": 0.3364284634590149, "num_input_tokens_seen": 0, "step": 73 }, { "epoch": 0.8705882352941177, "grad_norm": 0.2533928699463178, "learning_rate": 1.918957811620231e-05, "loss": 0.3229159712791443, "num_input_tokens_seen": 0, "step": 74 }, { "epoch": 0.8823529411764706, "grad_norm": 0.27551639833029534, "learning_rate": 1.915855713248129e-05, "loss": 0.317361056804657, "num_input_tokens_seen": 0, "step": 75 }, { "epoch": 0.8941176470588236, "grad_norm": 0.26990146657663827, "learning_rate": 1.912697958552778e-05, "loss": 0.31780922412872314, "num_input_tokens_seen": 0, "step": 76 }, { "epoch": 0.9058823529411765, "grad_norm": 0.3150863510764379, "learning_rate": 1.9094847394301427e-05, "loss": 0.33098268508911133, "num_input_tokens_seen": 0, "step": 77 }, { "epoch": 0.9176470588235294, "grad_norm": 0.3234901727951674, "learning_rate": 1.906216251146748e-05, "loss": 0.341233491897583, "num_input_tokens_seen": 0, "step": 78 }, { "epoch": 0.9294117647058824, "grad_norm": 0.26331326581875253, "learning_rate": 1.902892692327811e-05, "loss": 0.33283838629722595, "num_input_tokens_seen": 0, "step": 79 }, { "epoch": 0.9411764705882353, "grad_norm": 0.31786869502135223, "learning_rate": 1.899514264945173e-05, "loss": 0.3331839442253113, "num_input_tokens_seen": 0, "step": 80 }, { "epoch": 0.9529411764705882, "grad_norm": 0.29539690375673217, "learning_rate": 1.8960811743050227e-05, "loss": 0.33531326055526733, "num_input_tokens_seen": 0, "step": 81 }, { "epoch": 0.9647058823529412, "grad_norm": 0.30472122382886785, "learning_rate": 1.8925936290354224e-05, "loss": 0.3103257417678833, "num_input_tokens_seen": 0, "step": 82 }, { "epoch": 0.9764705882352941, "grad_norm": 0.2530167563030317, "learning_rate": 1.8890518410736275e-05, "loss": 0.32245466113090515, "num_input_tokens_seen": 0, "step": 83 }, { "epoch": 0.9882352941176471, "grad_norm": 0.28897856083778817, "learning_rate": 1.8854560256532098e-05, "loss": 0.3198079764842987, "num_input_tokens_seen": 0, "step": 84 }, { "epoch": 1.0, "grad_norm": 0.26450715598842334, "learning_rate": 1.8818064012909755e-05, "loss": 0.3213130235671997, "num_input_tokens_seen": 0, "step": 85 }, { "epoch": 1.011764705882353, "grad_norm": 0.2626207427201876, "learning_rate": 1.878103189773686e-05, "loss": 0.2763475179672241, "num_input_tokens_seen": 0, "step": 86 }, { "epoch": 1.0235294117647058, "grad_norm": 0.2629994874410627, "learning_rate": 1.8743466161445823e-05, "loss": 0.2665697932243347, "num_input_tokens_seen": 0, "step": 87 }, { "epoch": 1.035294117647059, "grad_norm": 0.29251013301945034, "learning_rate": 1.8705369086897063e-05, "loss": 0.2806475758552551, "num_input_tokens_seen": 0, "step": 88 }, { "epoch": 1.0470588235294118, "grad_norm": 0.2852660256104482, "learning_rate": 1.86667429892403e-05, "loss": 0.2648066282272339, "num_input_tokens_seen": 0, "step": 89 }, { "epoch": 1.0588235294117647, "grad_norm": 0.2762797160323279, "learning_rate": 1.862759021577385e-05, "loss": 0.2734478712081909, "num_input_tokens_seen": 0, "step": 90 }, { "epoch": 1.0705882352941176, "grad_norm": 0.3020506174745607, "learning_rate": 1.8587913145801998e-05, "loss": 0.2635505795478821, "num_input_tokens_seen": 0, "step": 91 }, { "epoch": 1.0823529411764705, "grad_norm": 0.28351991002826543, "learning_rate": 1.8547714190490385e-05, "loss": 0.2799134850502014, "num_input_tokens_seen": 0, "step": 92 }, { "epoch": 1.0941176470588236, "grad_norm": 0.2740110743262188, "learning_rate": 1.8506995792719498e-05, "loss": 0.2726055979728699, "num_input_tokens_seen": 0, "step": 93 }, { "epoch": 1.1058823529411765, "grad_norm": 0.30269062031162386, "learning_rate": 1.8465760426936212e-05, "loss": 0.2837594747543335, "num_input_tokens_seen": 0, "step": 94 }, { "epoch": 1.1176470588235294, "grad_norm": 0.30315593582121325, "learning_rate": 1.8424010599003424e-05, "loss": 0.2731676995754242, "num_input_tokens_seen": 0, "step": 95 }, { "epoch": 1.1294117647058823, "grad_norm": 0.26149596901353317, "learning_rate": 1.838174884604776e-05, "loss": 0.2705945372581482, "num_input_tokens_seen": 0, "step": 96 }, { "epoch": 1.1411764705882352, "grad_norm": 0.3026007790348899, "learning_rate": 1.8338977736305408e-05, "loss": 0.2789444923400879, "num_input_tokens_seen": 0, "step": 97 }, { "epoch": 1.1529411764705881, "grad_norm": 0.3349505482436329, "learning_rate": 1.8295699868966038e-05, "loss": 0.2682260572910309, "num_input_tokens_seen": 0, "step": 98 }, { "epoch": 1.1647058823529413, "grad_norm": 0.2891949628165266, "learning_rate": 1.8251917874014854e-05, "loss": 0.28042054176330566, "num_input_tokens_seen": 0, "step": 99 }, { "epoch": 1.1764705882352942, "grad_norm": 0.2941393562882544, "learning_rate": 1.8207634412072765e-05, "loss": 0.25862598419189453, "num_input_tokens_seen": 0, "step": 100 }, { "epoch": 1.188235294117647, "grad_norm": 0.32722595542360156, "learning_rate": 1.8162852174234712e-05, "loss": 0.2712678909301758, "num_input_tokens_seen": 0, "step": 101 }, { "epoch": 1.2, "grad_norm": 0.2755950618714099, "learning_rate": 1.8117573881906114e-05, "loss": 0.26205819845199585, "num_input_tokens_seen": 0, "step": 102 }, { "epoch": 1.2117647058823529, "grad_norm": 0.2571435526644292, "learning_rate": 1.8071802286637505e-05, "loss": 0.2622745633125305, "num_input_tokens_seen": 0, "step": 103 }, { "epoch": 1.223529411764706, "grad_norm": 0.2591172743832164, "learning_rate": 1.8025540169957315e-05, "loss": 0.25631460547447205, "num_input_tokens_seen": 0, "step": 104 }, { "epoch": 1.2352941176470589, "grad_norm": 0.2817321177900711, "learning_rate": 1.7978790343202826e-05, "loss": 0.2782523036003113, "num_input_tokens_seen": 0, "step": 105 }, { "epoch": 1.2470588235294118, "grad_norm": 0.33225523735776513, "learning_rate": 1.7931555647349358e-05, "loss": 0.2600249946117401, "num_input_tokens_seen": 0, "step": 106 }, { "epoch": 1.2588235294117647, "grad_norm": 0.2786742348476795, "learning_rate": 1.7883838952837595e-05, "loss": 0.25568312406539917, "num_input_tokens_seen": 0, "step": 107 }, { "epoch": 1.2705882352941176, "grad_norm": 0.2771371487960206, "learning_rate": 1.7835643159399156e-05, "loss": 0.2384142279624939, "num_input_tokens_seen": 0, "step": 108 }, { "epoch": 1.2823529411764705, "grad_norm": 0.31328815588599274, "learning_rate": 1.778697119588039e-05, "loss": 0.2667343318462372, "num_input_tokens_seen": 0, "step": 109 }, { "epoch": 1.2941176470588236, "grad_norm": 0.2690014796691674, "learning_rate": 1.7737826020064377e-05, "loss": 0.2558494210243225, "num_input_tokens_seen": 0, "step": 110 }, { "epoch": 1.3058823529411765, "grad_norm": 0.3038451633685586, "learning_rate": 1.76882106184912e-05, "loss": 0.25802576541900635, "num_input_tokens_seen": 0, "step": 111 }, { "epoch": 1.3176470588235294, "grad_norm": 0.26349039262552754, "learning_rate": 1.7638128006276422e-05, "loss": 0.26081448793411255, "num_input_tokens_seen": 0, "step": 112 }, { "epoch": 1.3294117647058823, "grad_norm": 0.27581161125402026, "learning_rate": 1.758758122692791e-05, "loss": 0.27647483348846436, "num_input_tokens_seen": 0, "step": 113 }, { "epoch": 1.3411764705882354, "grad_norm": 0.3235486769428178, "learning_rate": 1.753657335216083e-05, "loss": 0.2677750587463379, "num_input_tokens_seen": 0, "step": 114 }, { "epoch": 1.3529411764705883, "grad_norm": 0.2809145367414571, "learning_rate": 1.7485107481711014e-05, "loss": 0.2682688236236572, "num_input_tokens_seen": 0, "step": 115 }, { "epoch": 1.3647058823529412, "grad_norm": 0.2619951939456424, "learning_rate": 1.743318674314656e-05, "loss": 0.25316929817199707, "num_input_tokens_seen": 0, "step": 116 }, { "epoch": 1.3764705882352941, "grad_norm": 0.27411080913366315, "learning_rate": 1.7380814291677818e-05, "loss": 0.2697577476501465, "num_input_tokens_seen": 0, "step": 117 }, { "epoch": 1.388235294117647, "grad_norm": 0.3338822677438316, "learning_rate": 1.7327993309965583e-05, "loss": 0.2708876132965088, "num_input_tokens_seen": 0, "step": 118 }, { "epoch": 1.4, "grad_norm": 0.31962282276030907, "learning_rate": 1.7274727007927747e-05, "loss": 0.27048563957214355, "num_input_tokens_seen": 0, "step": 119 }, { "epoch": 1.4117647058823528, "grad_norm": 0.285342846378909, "learning_rate": 1.7221018622544197e-05, "loss": 0.2710177004337311, "num_input_tokens_seen": 0, "step": 120 }, { "epoch": 1.423529411764706, "grad_norm": 0.308814170391406, "learning_rate": 1.7166871417660116e-05, "loss": 0.2526181936264038, "num_input_tokens_seen": 0, "step": 121 }, { "epoch": 1.4352941176470588, "grad_norm": 0.27775597890631276, "learning_rate": 1.7112288683787637e-05, "loss": 0.26763850450515747, "num_input_tokens_seen": 0, "step": 122 }, { "epoch": 1.4470588235294117, "grad_norm": 0.2958185178060128, "learning_rate": 1.7057273737905887e-05, "loss": 0.268245667219162, "num_input_tokens_seen": 0, "step": 123 }, { "epoch": 1.4588235294117646, "grad_norm": 0.2483775556217329, "learning_rate": 1.70018299232594e-05, "loss": 0.25788575410842896, "num_input_tokens_seen": 0, "step": 124 }, { "epoch": 1.4705882352941178, "grad_norm": 0.2811097779442606, "learning_rate": 1.6945960609154966e-05, "loss": 0.26732224225997925, "num_input_tokens_seen": 0, "step": 125 }, { "epoch": 1.4823529411764707, "grad_norm": 0.2934299916938348, "learning_rate": 1.688966919075687e-05, "loss": 0.26281166076660156, "num_input_tokens_seen": 0, "step": 126 }, { "epoch": 1.4941176470588236, "grad_norm": 0.2368134963295287, "learning_rate": 1.6832959088880557e-05, "loss": 0.25862863659858704, "num_input_tokens_seen": 0, "step": 127 }, { "epoch": 1.5058823529411764, "grad_norm": 0.2708304514650526, "learning_rate": 1.677583374978478e-05, "loss": 0.2421874701976776, "num_input_tokens_seen": 0, "step": 128 }, { "epoch": 1.5176470588235293, "grad_norm": 0.2767120423486198, "learning_rate": 1.6718296644962146e-05, "loss": 0.2624642550945282, "num_input_tokens_seen": 0, "step": 129 }, { "epoch": 1.5294117647058822, "grad_norm": 0.2813174470652987, "learning_rate": 1.6660351270928164e-05, "loss": 0.24937519431114197, "num_input_tokens_seen": 0, "step": 130 }, { "epoch": 1.5411764705882351, "grad_norm": 0.3009488397968105, "learning_rate": 1.660200114900876e-05, "loss": 0.2704227566719055, "num_input_tokens_seen": 0, "step": 131 }, { "epoch": 1.5529411764705883, "grad_norm": 0.3141059797795813, "learning_rate": 1.6543249825126285e-05, "loss": 0.26932939887046814, "num_input_tokens_seen": 0, "step": 132 }, { "epoch": 1.5647058823529412, "grad_norm": 0.25053717473426707, "learning_rate": 1.6484100869584044e-05, "loss": 0.2592698633670807, "num_input_tokens_seen": 0, "step": 133 }, { "epoch": 1.576470588235294, "grad_norm": 0.25700597213890997, "learning_rate": 1.6424557876849308e-05, "loss": 0.27053964138031006, "num_input_tokens_seen": 0, "step": 134 }, { "epoch": 1.5882352941176472, "grad_norm": 0.30182930329649144, "learning_rate": 1.636462446533489e-05, "loss": 0.25989019870758057, "num_input_tokens_seen": 0, "step": 135 }, { "epoch": 1.6, "grad_norm": 0.26390881674937633, "learning_rate": 1.6304304277179267e-05, "loss": 0.2570236027240753, "num_input_tokens_seen": 0, "step": 136 }, { "epoch": 1.611764705882353, "grad_norm": 0.2652947312714827, "learning_rate": 1.6243600978025215e-05, "loss": 0.2678568363189697, "num_input_tokens_seen": 0, "step": 137 }, { "epoch": 1.6235294117647059, "grad_norm": 0.2575940385752971, "learning_rate": 1.6182518256797095e-05, "loss": 0.2600210905075073, "num_input_tokens_seen": 0, "step": 138 }, { "epoch": 1.6352941176470588, "grad_norm": 0.2610590842320019, "learning_rate": 1.612105982547663e-05, "loss": 0.26671087741851807, "num_input_tokens_seen": 0, "step": 139 }, { "epoch": 1.6470588235294117, "grad_norm": 0.25464302295329627, "learning_rate": 1.605922941887737e-05, "loss": 0.2668280005455017, "num_input_tokens_seen": 0, "step": 140 }, { "epoch": 1.6588235294117646, "grad_norm": 0.26069231826980477, "learning_rate": 1.599703079441769e-05, "loss": 0.2653328478336334, "num_input_tokens_seen": 0, "step": 141 }, { "epoch": 1.6705882352941175, "grad_norm": 0.27072482250492486, "learning_rate": 1.5934467731892497e-05, "loss": 0.2632245719432831, "num_input_tokens_seen": 0, "step": 142 }, { "epoch": 1.6823529411764706, "grad_norm": 0.24138888757547514, "learning_rate": 1.5871544033243488e-05, "loss": 0.26093634963035583, "num_input_tokens_seen": 0, "step": 143 }, { "epoch": 1.6941176470588235, "grad_norm": 0.25857892670146815, "learning_rate": 1.5808263522328137e-05, "loss": 0.2518957853317261, "num_input_tokens_seen": 0, "step": 144 }, { "epoch": 1.7058823529411766, "grad_norm": 0.25322801625227936, "learning_rate": 1.5744630044687307e-05, "loss": 0.25198179483413696, "num_input_tokens_seen": 0, "step": 145 }, { "epoch": 1.7176470588235295, "grad_norm": 0.23398219863607192, "learning_rate": 1.568064746731156e-05, "loss": 0.25039592385292053, "num_input_tokens_seen": 0, "step": 146 }, { "epoch": 1.7294117647058824, "grad_norm": 0.22752785226042835, "learning_rate": 1.561631967840617e-05, "loss": 0.25004899501800537, "num_input_tokens_seen": 0, "step": 147 }, { "epoch": 1.7411764705882353, "grad_norm": 0.26867363858385673, "learning_rate": 1.5551650587154815e-05, "loss": 0.2628065347671509, "num_input_tokens_seen": 0, "step": 148 }, { "epoch": 1.7529411764705882, "grad_norm": 0.2572214700469002, "learning_rate": 1.5486644123482047e-05, "loss": 0.2694377899169922, "num_input_tokens_seen": 0, "step": 149 }, { "epoch": 1.7647058823529411, "grad_norm": 0.2649069012394484, "learning_rate": 1.542130423781444e-05, "loss": 0.2698570787906647, "num_input_tokens_seen": 0, "step": 150 }, { "epoch": 1.776470588235294, "grad_norm": 0.3129557276746984, "learning_rate": 1.5355634900840558e-05, "loss": 0.2620123624801636, "num_input_tokens_seen": 0, "step": 151 }, { "epoch": 1.788235294117647, "grad_norm": 0.219414643912218, "learning_rate": 1.5289640103269626e-05, "loss": 0.24250832200050354, "num_input_tokens_seen": 0, "step": 152 }, { "epoch": 1.8, "grad_norm": 0.2787522458312503, "learning_rate": 1.5223323855589027e-05, "loss": 0.2599625885486603, "num_input_tokens_seen": 0, "step": 153 }, { "epoch": 1.811764705882353, "grad_norm": 0.24624844789559322, "learning_rate": 1.5156690187820596e-05, "loss": 0.2539859712123871, "num_input_tokens_seen": 0, "step": 154 }, { "epoch": 1.8235294117647058, "grad_norm": 0.28786535612403885, "learning_rate": 1.50897431492757e-05, "loss": 0.251323938369751, "num_input_tokens_seen": 0, "step": 155 }, { "epoch": 1.835294117647059, "grad_norm": 0.2861446800798861, "learning_rate": 1.5022486808309171e-05, "loss": 0.2852325439453125, "num_input_tokens_seen": 0, "step": 156 }, { "epoch": 1.8470588235294119, "grad_norm": 0.30835997118524755, "learning_rate": 1.4954925252072077e-05, "loss": 0.2626144289970398, "num_input_tokens_seen": 0, "step": 157 }, { "epoch": 1.8588235294117648, "grad_norm": 0.27166093756727683, "learning_rate": 1.4887062586263334e-05, "loss": 0.26250118017196655, "num_input_tokens_seen": 0, "step": 158 }, { "epoch": 1.8705882352941177, "grad_norm": 0.30818985316404857, "learning_rate": 1.4818902934880222e-05, "loss": 0.27699387073516846, "num_input_tokens_seen": 0, "step": 159 }, { "epoch": 1.8823529411764706, "grad_norm": 0.30205479197808555, "learning_rate": 1.4750450439967751e-05, "loss": 0.272649347782135, "num_input_tokens_seen": 0, "step": 160 }, { "epoch": 1.8941176470588235, "grad_norm": 0.29949042144033816, "learning_rate": 1.4681709261366963e-05, "loss": 0.2485789656639099, "num_input_tokens_seen": 0, "step": 161 }, { "epoch": 1.9058823529411764, "grad_norm": 0.267903631477539, "learning_rate": 1.4612683576462135e-05, "loss": 0.2616223096847534, "num_input_tokens_seen": 0, "step": 162 }, { "epoch": 1.9176470588235293, "grad_norm": 0.27260315220708237, "learning_rate": 1.4543377579926915e-05, "loss": 0.27286335825920105, "num_input_tokens_seen": 0, "step": 163 }, { "epoch": 1.9294117647058824, "grad_norm": 0.28592302424298965, "learning_rate": 1.4473795483469442e-05, "loss": 0.24860531091690063, "num_input_tokens_seen": 0, "step": 164 }, { "epoch": 1.9411764705882353, "grad_norm": 0.27067444548694936, "learning_rate": 1.4403941515576344e-05, "loss": 0.2611614167690277, "num_input_tokens_seen": 0, "step": 165 }, { "epoch": 1.9529411764705882, "grad_norm": 0.26432408877050523, "learning_rate": 1.4333819921255836e-05, "loss": 0.26266223192214966, "num_input_tokens_seen": 0, "step": 166 }, { "epoch": 1.9647058823529413, "grad_norm": 0.32069387585361836, "learning_rate": 1.4263434961779709e-05, "loss": 0.24890068173408508, "num_input_tokens_seen": 0, "step": 167 }, { "epoch": 1.9764705882352942, "grad_norm": 0.28968277975368684, "learning_rate": 1.41927909144244e-05, "loss": 0.2612011432647705, "num_input_tokens_seen": 0, "step": 168 }, { "epoch": 1.988235294117647, "grad_norm": 0.2593706365289158, "learning_rate": 1.412189207221104e-05, "loss": 0.24890106916427612, "num_input_tokens_seen": 0, "step": 169 }, { "epoch": 2.0, "grad_norm": 0.25908450639554936, "learning_rate": 1.4050742743644588e-05, "loss": 0.25550538301467896, "num_input_tokens_seen": 0, "step": 170 }, { "epoch": 2.011764705882353, "grad_norm": 0.32606044201267254, "learning_rate": 1.3979347252451994e-05, "loss": 0.20405685901641846, "num_input_tokens_seen": 0, "step": 171 }, { "epoch": 2.023529411764706, "grad_norm": 0.31532835367496725, "learning_rate": 1.3907709937319451e-05, "loss": 0.2080579251050949, "num_input_tokens_seen": 0, "step": 172 }, { "epoch": 2.0352941176470587, "grad_norm": 0.23106550000023307, "learning_rate": 1.3835835151628728e-05, "loss": 0.1862945556640625, "num_input_tokens_seen": 0, "step": 173 }, { "epoch": 2.0470588235294116, "grad_norm": 0.2399759682184491, "learning_rate": 1.3763727263192626e-05, "loss": 0.18684154748916626, "num_input_tokens_seen": 0, "step": 174 }, { "epoch": 2.0588235294117645, "grad_norm": 0.24298409208730917, "learning_rate": 1.3691390653989536e-05, "loss": 0.19205346703529358, "num_input_tokens_seen": 0, "step": 175 }, { "epoch": 2.070588235294118, "grad_norm": 0.2606890104298591, "learning_rate": 1.3618829719897158e-05, "loss": 0.19722914695739746, "num_input_tokens_seen": 0, "step": 176 }, { "epoch": 2.0823529411764707, "grad_norm": 0.2634006155067239, "learning_rate": 1.3546048870425356e-05, "loss": 0.18658706545829773, "num_input_tokens_seen": 0, "step": 177 }, { "epoch": 2.0941176470588236, "grad_norm": 0.2474551016529151, "learning_rate": 1.3473052528448203e-05, "loss": 0.18761307001113892, "num_input_tokens_seen": 0, "step": 178 }, { "epoch": 2.1058823529411765, "grad_norm": 0.2773501459528279, "learning_rate": 1.3399845129935191e-05, "loss": 0.2006130963563919, "num_input_tokens_seen": 0, "step": 179 }, { "epoch": 2.1176470588235294, "grad_norm": 0.24768518968840073, "learning_rate": 1.3326431123681667e-05, "loss": 0.1869545876979828, "num_input_tokens_seen": 0, "step": 180 }, { "epoch": 2.1294117647058823, "grad_norm": 0.24087563849344726, "learning_rate": 1.3252814971038477e-05, "loss": 0.19419728219509125, "num_input_tokens_seen": 0, "step": 181 }, { "epoch": 2.1411764705882352, "grad_norm": 0.24859116981429222, "learning_rate": 1.3179001145640856e-05, "loss": 0.1937357634305954, "num_input_tokens_seen": 0, "step": 182 }, { "epoch": 2.152941176470588, "grad_norm": 0.2513377458414818, "learning_rate": 1.3104994133136563e-05, "loss": 0.18806332349777222, "num_input_tokens_seen": 0, "step": 183 }, { "epoch": 2.164705882352941, "grad_norm": 0.24195612774749747, "learning_rate": 1.3030798430913289e-05, "loss": 0.19312450289726257, "num_input_tokens_seen": 0, "step": 184 }, { "epoch": 2.176470588235294, "grad_norm": 0.2598954308224352, "learning_rate": 1.295641854782535e-05, "loss": 0.19178995490074158, "num_input_tokens_seen": 0, "step": 185 }, { "epoch": 2.1882352941176473, "grad_norm": 0.2738424910649441, "learning_rate": 1.2881859003919688e-05, "loss": 0.19293949007987976, "num_input_tokens_seen": 0, "step": 186 }, { "epoch": 2.2, "grad_norm": 0.24146821641260552, "learning_rate": 1.2807124330161188e-05, "loss": 0.18528440594673157, "num_input_tokens_seen": 0, "step": 187 }, { "epoch": 2.211764705882353, "grad_norm": 0.257111381442425, "learning_rate": 1.2732219068157335e-05, "loss": 0.18848256766796112, "num_input_tokens_seen": 0, "step": 188 }, { "epoch": 2.223529411764706, "grad_norm": 0.2526409622347608, "learning_rate": 1.2657147769882215e-05, "loss": 0.18127834796905518, "num_input_tokens_seen": 0, "step": 189 }, { "epoch": 2.235294117647059, "grad_norm": 0.23701529976763616, "learning_rate": 1.2581914997399899e-05, "loss": 0.18892061710357666, "num_input_tokens_seen": 0, "step": 190 }, { "epoch": 2.2470588235294118, "grad_norm": 0.24297086363023263, "learning_rate": 1.2506525322587207e-05, "loss": 0.19873817265033722, "num_input_tokens_seen": 0, "step": 191 }, { "epoch": 2.2588235294117647, "grad_norm": 0.2537032696104157, "learning_rate": 1.2430983326855873e-05, "loss": 0.1893860250711441, "num_input_tokens_seen": 0, "step": 192 }, { "epoch": 2.2705882352941176, "grad_norm": 0.23876942589975814, "learning_rate": 1.2355293600874132e-05, "loss": 0.18759432435035706, "num_input_tokens_seen": 0, "step": 193 }, { "epoch": 2.2823529411764705, "grad_norm": 0.2435388542806445, "learning_rate": 1.2279460744287755e-05, "loss": 0.18849223852157593, "num_input_tokens_seen": 0, "step": 194 }, { "epoch": 2.2941176470588234, "grad_norm": 0.2647343889775541, "learning_rate": 1.220348936544052e-05, "loss": 0.18661049008369446, "num_input_tokens_seen": 0, "step": 195 }, { "epoch": 2.3058823529411763, "grad_norm": 0.25540155279573523, "learning_rate": 1.2127384081094167e-05, "loss": 0.18517085909843445, "num_input_tokens_seen": 0, "step": 196 }, { "epoch": 2.317647058823529, "grad_norm": 0.24552318557540526, "learning_rate": 1.205114951614785e-05, "loss": 0.17878204584121704, "num_input_tokens_seen": 0, "step": 197 }, { "epoch": 2.3294117647058825, "grad_norm": 0.2258935926658077, "learning_rate": 1.197479030335706e-05, "loss": 0.18578067421913147, "num_input_tokens_seen": 0, "step": 198 }, { "epoch": 2.3411764705882354, "grad_norm": 0.22583777859137, "learning_rate": 1.1898311083052113e-05, "loss": 0.19397635757923126, "num_input_tokens_seen": 0, "step": 199 }, { "epoch": 2.3529411764705883, "grad_norm": 0.23201542489820412, "learning_rate": 1.1821716502856154e-05, "loss": 0.18146567046642303, "num_input_tokens_seen": 0, "step": 200 }, { "epoch": 2.364705882352941, "grad_norm": 0.28552700838642453, "learning_rate": 1.1745011217402709e-05, "loss": 0.19469541311264038, "num_input_tokens_seen": 0, "step": 201 }, { "epoch": 2.376470588235294, "grad_norm": 0.24910488131854605, "learning_rate": 1.1668199888052844e-05, "loss": 0.18924464285373688, "num_input_tokens_seen": 0, "step": 202 }, { "epoch": 2.388235294117647, "grad_norm": 0.24952841695443162, "learning_rate": 1.159128718261189e-05, "loss": 0.18815085291862488, "num_input_tokens_seen": 0, "step": 203 }, { "epoch": 2.4, "grad_norm": 0.22629712220582293, "learning_rate": 1.1514277775045768e-05, "loss": 0.18509158492088318, "num_input_tokens_seen": 0, "step": 204 }, { "epoch": 2.411764705882353, "grad_norm": 0.2258797767600323, "learning_rate": 1.1437176345196967e-05, "loss": 0.17601992189884186, "num_input_tokens_seen": 0, "step": 205 }, { "epoch": 2.4235294117647057, "grad_norm": 0.2994549469629298, "learning_rate": 1.135998757850015e-05, "loss": 0.19033361971378326, "num_input_tokens_seen": 0, "step": 206 }, { "epoch": 2.435294117647059, "grad_norm": 0.28669793445051134, "learning_rate": 1.128271616569741e-05, "loss": 0.19659247994422913, "num_input_tokens_seen": 0, "step": 207 }, { "epoch": 2.447058823529412, "grad_norm": 0.24321969874326846, "learning_rate": 1.1205366802553231e-05, "loss": 0.189006507396698, "num_input_tokens_seen": 0, "step": 208 }, { "epoch": 2.458823529411765, "grad_norm": 0.23277687621799142, "learning_rate": 1.1127944189569122e-05, "loss": 0.18315881490707397, "num_input_tokens_seen": 0, "step": 209 }, { "epoch": 2.4705882352941178, "grad_norm": 0.24644185758060683, "learning_rate": 1.1050453031697958e-05, "loss": 0.18082918226718903, "num_input_tokens_seen": 0, "step": 210 }, { "epoch": 2.4823529411764707, "grad_norm": 0.27537652887423003, "learning_rate": 1.0972898038058077e-05, "loss": 0.18804597854614258, "num_input_tokens_seen": 0, "step": 211 }, { "epoch": 2.4941176470588236, "grad_norm": 0.22999355280888956, "learning_rate": 1.0895283921647098e-05, "loss": 0.18512041866779327, "num_input_tokens_seen": 0, "step": 212 }, { "epoch": 2.5058823529411764, "grad_norm": 0.24328460263907906, "learning_rate": 1.0817615399055513e-05, "loss": 0.18306857347488403, "num_input_tokens_seen": 0, "step": 213 }, { "epoch": 2.5176470588235293, "grad_norm": 0.24353741537161722, "learning_rate": 1.0739897190180066e-05, "loss": 0.18730933964252472, "num_input_tokens_seen": 0, "step": 214 }, { "epoch": 2.5294117647058822, "grad_norm": 0.24168464720218039, "learning_rate": 1.0662134017936924e-05, "loss": 0.1890895515680313, "num_input_tokens_seen": 0, "step": 215 }, { "epoch": 2.541176470588235, "grad_norm": 0.2464118373551017, "learning_rate": 1.0584330607974673e-05, "loss": 0.1896791309118271, "num_input_tokens_seen": 0, "step": 216 }, { "epoch": 2.552941176470588, "grad_norm": 0.2272154213017855, "learning_rate": 1.0506491688387128e-05, "loss": 0.187567800283432, "num_input_tokens_seen": 0, "step": 217 }, { "epoch": 2.564705882352941, "grad_norm": 0.23687499350154168, "learning_rate": 1.0428621989426016e-05, "loss": 0.19160117208957672, "num_input_tokens_seen": 0, "step": 218 }, { "epoch": 2.576470588235294, "grad_norm": 0.23229299934050784, "learning_rate": 1.0350726243213519e-05, "loss": 0.18402451276779175, "num_input_tokens_seen": 0, "step": 219 }, { "epoch": 2.588235294117647, "grad_norm": 0.2515873476821987, "learning_rate": 1.0272809183454701e-05, "loss": 0.18722085654735565, "num_input_tokens_seen": 0, "step": 220 }, { "epoch": 2.6, "grad_norm": 0.22755973648814593, "learning_rate": 1.0194875545149854e-05, "loss": 0.18111610412597656, "num_input_tokens_seen": 0, "step": 221 }, { "epoch": 2.611764705882353, "grad_norm": 0.23007823552128587, "learning_rate": 1.0116930064306736e-05, "loss": 0.19812649488449097, "num_input_tokens_seen": 0, "step": 222 }, { "epoch": 2.623529411764706, "grad_norm": 0.22528651243150996, "learning_rate": 1.0038977477652779e-05, "loss": 0.18580538034439087, "num_input_tokens_seen": 0, "step": 223 }, { "epoch": 2.635294117647059, "grad_norm": 0.23539863556511334, "learning_rate": 9.961022522347226e-06, "loss": 0.18501965701580048, "num_input_tokens_seen": 0, "step": 224 }, { "epoch": 2.6470588235294117, "grad_norm": 0.22782974012346754, "learning_rate": 9.883069935693267e-06, "loss": 0.18402716517448425, "num_input_tokens_seen": 0, "step": 225 }, { "epoch": 2.6588235294117646, "grad_norm": 0.24487953714591462, "learning_rate": 9.80512445485015e-06, "loss": 0.18938913941383362, "num_input_tokens_seen": 0, "step": 226 }, { "epoch": 2.6705882352941175, "grad_norm": 0.24462792166495934, "learning_rate": 9.7271908165453e-06, "loss": 0.19719335436820984, "num_input_tokens_seen": 0, "step": 227 }, { "epoch": 2.682352941176471, "grad_norm": 0.2366728459616901, "learning_rate": 9.649273756786486e-06, "loss": 0.185680091381073, "num_input_tokens_seen": 0, "step": 228 }, { "epoch": 2.6941176470588237, "grad_norm": 0.2303882056729561, "learning_rate": 9.57137801057399e-06, "loss": 0.19624218344688416, "num_input_tokens_seen": 0, "step": 229 }, { "epoch": 2.7058823529411766, "grad_norm": 0.22987803077687444, "learning_rate": 9.493508311612874e-06, "loss": 0.17861570417881012, "num_input_tokens_seen": 0, "step": 230 }, { "epoch": 2.7176470588235295, "grad_norm": 0.26388168681073687, "learning_rate": 9.415669392025329e-06, "loss": 0.18734458088874817, "num_input_tokens_seen": 0, "step": 231 }, { "epoch": 2.7294117647058824, "grad_norm": 0.22186631357859773, "learning_rate": 9.337865982063076e-06, "loss": 0.1946583092212677, "num_input_tokens_seen": 0, "step": 232 }, { "epoch": 2.7411764705882353, "grad_norm": 0.2311568846601055, "learning_rate": 9.260102809819939e-06, "loss": 0.18761436641216278, "num_input_tokens_seen": 0, "step": 233 }, { "epoch": 2.7529411764705882, "grad_norm": 0.22628859572679205, "learning_rate": 9.182384600944494e-06, "loss": 0.18877655267715454, "num_input_tokens_seen": 0, "step": 234 }, { "epoch": 2.764705882352941, "grad_norm": 0.24528368812451035, "learning_rate": 9.104716078352906e-06, "loss": 0.18831658363342285, "num_input_tokens_seen": 0, "step": 235 }, { "epoch": 2.776470588235294, "grad_norm": 0.22998847186224078, "learning_rate": 9.027101961941925e-06, "loss": 0.18712544441223145, "num_input_tokens_seen": 0, "step": 236 }, { "epoch": 2.788235294117647, "grad_norm": 0.22929072663885758, "learning_rate": 8.949546968302042e-06, "loss": 0.20112478733062744, "num_input_tokens_seen": 0, "step": 237 }, { "epoch": 2.8, "grad_norm": 0.23007743920004314, "learning_rate": 8.872055810430881e-06, "loss": 0.18601751327514648, "num_input_tokens_seen": 0, "step": 238 }, { "epoch": 2.8117647058823527, "grad_norm": 0.23659776591024959, "learning_rate": 8.79463319744677e-06, "loss": 0.1808547079563141, "num_input_tokens_seen": 0, "step": 239 }, { "epoch": 2.8235294117647056, "grad_norm": 0.23509031587485976, "learning_rate": 8.717283834302593e-06, "loss": 0.18669113516807556, "num_input_tokens_seen": 0, "step": 240 }, { "epoch": 2.835294117647059, "grad_norm": 0.23472960086401704, "learning_rate": 8.640012421499856e-06, "loss": 0.19292673468589783, "num_input_tokens_seen": 0, "step": 241 }, { "epoch": 2.847058823529412, "grad_norm": 0.25429765516678327, "learning_rate": 8.562823654803035e-06, "loss": 0.18981140851974487, "num_input_tokens_seen": 0, "step": 242 }, { "epoch": 2.8588235294117648, "grad_norm": 0.23658961623110553, "learning_rate": 8.485722224954237e-06, "loss": 0.1999085545539856, "num_input_tokens_seen": 0, "step": 243 }, { "epoch": 2.8705882352941177, "grad_norm": 0.23199472766369356, "learning_rate": 8.408712817388113e-06, "loss": 0.1827118992805481, "num_input_tokens_seen": 0, "step": 244 }, { "epoch": 2.8823529411764706, "grad_norm": 0.25077191290374024, "learning_rate": 8.331800111947158e-06, "loss": 0.1857125163078308, "num_input_tokens_seen": 0, "step": 245 }, { "epoch": 2.8941176470588235, "grad_norm": 0.23140729045512615, "learning_rate": 8.254988782597295e-06, "loss": 0.18820548057556152, "num_input_tokens_seen": 0, "step": 246 }, { "epoch": 2.9058823529411764, "grad_norm": 0.22239003021686357, "learning_rate": 8.178283497143851e-06, "loss": 0.19076308608055115, "num_input_tokens_seen": 0, "step": 247 }, { "epoch": 2.9176470588235293, "grad_norm": 0.21803105840925516, "learning_rate": 8.10168891694789e-06, "loss": 0.18549099564552307, "num_input_tokens_seen": 0, "step": 248 }, { "epoch": 2.9294117647058826, "grad_norm": 0.23828217230216947, "learning_rate": 8.025209696642942e-06, "loss": 0.1965373158454895, "num_input_tokens_seen": 0, "step": 249 }, { "epoch": 2.9411764705882355, "grad_norm": 0.2390427784115555, "learning_rate": 7.948850483852153e-06, "loss": 0.18414372205734253, "num_input_tokens_seen": 0, "step": 250 }, { "epoch": 2.9529411764705884, "grad_norm": 0.24685443360941575, "learning_rate": 7.872615918905833e-06, "loss": 0.19256475567817688, "num_input_tokens_seen": 0, "step": 251 }, { "epoch": 2.9647058823529413, "grad_norm": 0.23889907605167213, "learning_rate": 7.796510634559487e-06, "loss": 0.19201350212097168, "num_input_tokens_seen": 0, "step": 252 }, { "epoch": 2.976470588235294, "grad_norm": 0.24349828471939475, "learning_rate": 7.720539255712252e-06, "loss": 0.18964079022407532, "num_input_tokens_seen": 0, "step": 253 }, { "epoch": 2.988235294117647, "grad_norm": 0.25208339650704936, "learning_rate": 7.644706399125871e-06, "loss": 0.19716620445251465, "num_input_tokens_seen": 0, "step": 254 }, { "epoch": 3.0, "grad_norm": 0.22219530818543262, "learning_rate": 7.569016673144132e-06, "loss": 0.19001775979995728, "num_input_tokens_seen": 0, "step": 255 } ], "logging_steps": 1, "max_steps": 425, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 267247958228992.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }