{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 2040, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004920049200492005, "grad_norm": 12.925619082345467, "learning_rate": 0.0, "loss": 6.9458, "step": 1 }, { "epoch": 0.00984009840098401, "grad_norm": 12.439802998162127, "learning_rate": 1.6129032258064517e-05, "loss": 6.8649, "step": 2 }, { "epoch": 0.014760147601476014, "grad_norm": 12.51958845793162, "learning_rate": 3.2258064516129034e-05, "loss": 5.6195, "step": 3 }, { "epoch": 0.01968019680196802, "grad_norm": 19.812188208842265, "learning_rate": 4.838709677419355e-05, "loss": 5.1976, "step": 4 }, { "epoch": 0.024600246002460024, "grad_norm": 12.520742078945409, "learning_rate": 6.451612903225807e-05, "loss": 4.8432, "step": 5 }, { "epoch": 0.02952029520295203, "grad_norm": 11.250780543272151, "learning_rate": 8.064516129032258e-05, "loss": 4.3401, "step": 6 }, { "epoch": 0.03444034440344403, "grad_norm": 9.30610097124001, "learning_rate": 9.67741935483871e-05, "loss": 4.3228, "step": 7 }, { "epoch": 0.03936039360393604, "grad_norm": 9.168589780177875, "learning_rate": 0.00011290322580645161, "loss": 4.3951, "step": 8 }, { "epoch": 0.04428044280442804, "grad_norm": 10.120899259751193, "learning_rate": 0.00012903225806451613, "loss": 4.3329, "step": 9 }, { "epoch": 0.04920049200492005, "grad_norm": 16.209259134418662, "learning_rate": 0.00014516129032258066, "loss": 4.665, "step": 10 }, { "epoch": 0.05412054120541206, "grad_norm": 15.206855189919262, "learning_rate": 0.00016129032258064516, "loss": 4.5063, "step": 11 }, { "epoch": 0.05904059040590406, "grad_norm": 4.542759273855053, "learning_rate": 0.0001774193548387097, "loss": 4.268, "step": 12 }, { "epoch": 0.06396063960639606, "grad_norm": 3.503249752279149, "learning_rate": 0.0001935483870967742, "loss": 4.1666, "step": 13 }, { "epoch": 0.06888068880688807, "grad_norm": 2.2972536558550405, "learning_rate": 0.00020967741935483871, "loss": 4.1318, "step": 14 }, { "epoch": 0.07380073800738007, "grad_norm": 2.5617740843415526, "learning_rate": 0.00022580645161290321, "loss": 3.6927, "step": 15 }, { "epoch": 0.07872078720787208, "grad_norm": 2.1745920678911204, "learning_rate": 0.00024193548387096774, "loss": 3.5523, "step": 16 }, { "epoch": 0.08364083640836409, "grad_norm": 1.4826825303467723, "learning_rate": 0.00025806451612903227, "loss": 3.6399, "step": 17 }, { "epoch": 0.08856088560885608, "grad_norm": 1.669307391999787, "learning_rate": 0.00027419354838709674, "loss": 3.5459, "step": 18 }, { "epoch": 0.09348093480934809, "grad_norm": 1.4439424705757826, "learning_rate": 0.0002903225806451613, "loss": 3.6782, "step": 19 }, { "epoch": 0.0984009840098401, "grad_norm": 2.616089636573695, "learning_rate": 0.0003064516129032258, "loss": 3.7196, "step": 20 }, { "epoch": 0.1033210332103321, "grad_norm": 2.5803385277636868, "learning_rate": 0.0003225806451612903, "loss": 3.2824, "step": 21 }, { "epoch": 0.10824108241082411, "grad_norm": 3.307901416742095, "learning_rate": 0.00033870967741935485, "loss": 3.4764, "step": 22 }, { "epoch": 0.11316113161131611, "grad_norm": 2.462071193818894, "learning_rate": 0.0003548387096774194, "loss": 3.3546, "step": 23 }, { "epoch": 0.11808118081180811, "grad_norm": 1.8691920960864983, "learning_rate": 0.0003709677419354839, "loss": 3.5888, "step": 24 }, { "epoch": 0.12300123001230012, "grad_norm": 3.278086128753021, "learning_rate": 0.0003870967741935484, "loss": 3.5084, "step": 25 }, { "epoch": 0.12792127921279212, "grad_norm": 5.427526078057263, "learning_rate": 0.0004032258064516129, "loss": 3.0548, "step": 26 }, { "epoch": 0.13284132841328414, "grad_norm": 8.252250934233532, "learning_rate": 0.00041935483870967743, "loss": 3.8151, "step": 27 }, { "epoch": 0.13776137761377613, "grad_norm": 6.184610478127356, "learning_rate": 0.00043548387096774196, "loss": 3.6137, "step": 28 }, { "epoch": 0.14268142681426815, "grad_norm": 8.855965077680962, "learning_rate": 0.00045161290322580643, "loss": 3.7698, "step": 29 }, { "epoch": 0.14760147601476015, "grad_norm": 6.007501592474108, "learning_rate": 0.00046774193548387096, "loss": 3.8248, "step": 30 }, { "epoch": 0.15252152521525214, "grad_norm": 5.798915999860381, "learning_rate": 0.0004838709677419355, "loss": 3.652, "step": 31 }, { "epoch": 0.15744157441574416, "grad_norm": 1.946807649198802, "learning_rate": 0.0005, "loss": 3.1857, "step": 32 }, { "epoch": 0.16236162361623616, "grad_norm": 2.3116846083617713, "learning_rate": 0.0005161290322580645, "loss": 2.9999, "step": 33 }, { "epoch": 0.16728167281672818, "grad_norm": 1.8292280351780028, "learning_rate": 0.0005322580645161291, "loss": 2.8625, "step": 34 }, { "epoch": 0.17220172201722017, "grad_norm": 2.3954293355446774, "learning_rate": 0.0005483870967741935, "loss": 2.7524, "step": 35 }, { "epoch": 0.17712177121771217, "grad_norm": 2.8458917352745465, "learning_rate": 0.0005645161290322581, "loss": 2.8517, "step": 36 }, { "epoch": 0.1820418204182042, "grad_norm": 3.896014561628686, "learning_rate": 0.0005806451612903226, "loss": 2.7568, "step": 37 }, { "epoch": 0.18696186961869618, "grad_norm": 2.4730923101153066, "learning_rate": 0.0005967741935483872, "loss": 2.6654, "step": 38 }, { "epoch": 0.1918819188191882, "grad_norm": 2.9053797748497305, "learning_rate": 0.0006129032258064516, "loss": 2.4254, "step": 39 }, { "epoch": 0.1968019680196802, "grad_norm": 4.963165050290689, "learning_rate": 0.0006290322580645161, "loss": 2.6661, "step": 40 }, { "epoch": 0.2017220172201722, "grad_norm": 8.306802795715427, "learning_rate": 0.0006451612903225806, "loss": 3.4461, "step": 41 }, { "epoch": 0.2066420664206642, "grad_norm": 5.007105408484473, "learning_rate": 0.0006612903225806452, "loss": 2.702, "step": 42 }, { "epoch": 0.2115621156211562, "grad_norm": 2.8714842939325984, "learning_rate": 0.0006774193548387097, "loss": 2.3502, "step": 43 }, { "epoch": 0.21648216482164823, "grad_norm": 14.699297118370827, "learning_rate": 0.0006935483870967742, "loss": 3.2365, "step": 44 }, { "epoch": 0.22140221402214022, "grad_norm": 9.628358419272084, "learning_rate": 0.0007096774193548388, "loss": 4.818, "step": 45 }, { "epoch": 0.22632226322263221, "grad_norm": 6.379085579709408, "learning_rate": 0.0007258064516129033, "loss": 3.337, "step": 46 }, { "epoch": 0.23124231242312424, "grad_norm": 6.977578086174092, "learning_rate": 0.0007419354838709678, "loss": 3.3851, "step": 47 }, { "epoch": 0.23616236162361623, "grad_norm": 9.295348059877952, "learning_rate": 0.0007580645161290322, "loss": 3.5117, "step": 48 }, { "epoch": 0.24108241082410825, "grad_norm": 9.429577746382938, "learning_rate": 0.0007741935483870968, "loss": 5.8779, "step": 49 }, { "epoch": 0.24600246002460024, "grad_norm": 3.5125210012639205, "learning_rate": 0.0007903225806451613, "loss": 4.4818, "step": 50 }, { "epoch": 0.25092250922509224, "grad_norm": 5.435744199655206, "learning_rate": 0.0008064516129032258, "loss": 4.428, "step": 51 }, { "epoch": 0.25584255842558423, "grad_norm": 1.4195571757638457, "learning_rate": 0.0008225806451612903, "loss": 3.9687, "step": 52 }, { "epoch": 0.2607626076260763, "grad_norm": 2.7097627656890557, "learning_rate": 0.0008387096774193549, "loss": 3.4137, "step": 53 }, { "epoch": 0.2656826568265683, "grad_norm": 1.0354544396674221, "learning_rate": 0.0008548387096774194, "loss": 3.1056, "step": 54 }, { "epoch": 0.27060270602706027, "grad_norm": 1.3218120757359644, "learning_rate": 0.0008709677419354839, "loss": 3.5996, "step": 55 }, { "epoch": 0.27552275522755226, "grad_norm": 0.7637843178600343, "learning_rate": 0.0008870967741935484, "loss": 3.5571, "step": 56 }, { "epoch": 0.28044280442804426, "grad_norm": 0.8932813105931199, "learning_rate": 0.0009032258064516129, "loss": 3.7487, "step": 57 }, { "epoch": 0.2853628536285363, "grad_norm": 3.3877765762538194, "learning_rate": 0.0009193548387096774, "loss": 3.8143, "step": 58 }, { "epoch": 0.2902829028290283, "grad_norm": 4.128364896120153, "learning_rate": 0.0009354838709677419, "loss": 4.0149, "step": 59 }, { "epoch": 0.2952029520295203, "grad_norm": 5.446656878174334, "learning_rate": 0.0009516129032258065, "loss": 3.927, "step": 60 }, { "epoch": 0.3001230012300123, "grad_norm": 0.5564695616031174, "learning_rate": 0.000967741935483871, "loss": 3.7795, "step": 61 }, { "epoch": 0.3050430504305043, "grad_norm": 0.1860776384829729, "learning_rate": 0.0009838709677419356, "loss": 4.1841, "step": 62 }, { "epoch": 0.30996309963099633, "grad_norm": 0.15887076170446043, "learning_rate": 0.001, "loss": 3.6392, "step": 63 }, { "epoch": 0.3148831488314883, "grad_norm": 0.14644459978117466, "learning_rate": 0.0009999993693519049, "loss": 3.8136, "step": 64 }, { "epoch": 0.3198031980319803, "grad_norm": 0.1283031998217305, "learning_rate": 0.0009999974774092106, "loss": 3.758, "step": 65 }, { "epoch": 0.3247232472324723, "grad_norm": 0.10629635119493148, "learning_rate": 0.0009999943241766895, "loss": 3.7428, "step": 66 }, { "epoch": 0.3296432964329643, "grad_norm": 0.09312990312972594, "learning_rate": 0.0009999899096622962, "loss": 3.9521, "step": 67 }, { "epoch": 0.33456334563345635, "grad_norm": 0.09046512182854186, "learning_rate": 0.0009999842338771665, "loss": 3.7979, "step": 68 }, { "epoch": 0.33948339483394835, "grad_norm": 0.08946205321786987, "learning_rate": 0.0009999772968356181, "loss": 3.6668, "step": 69 }, { "epoch": 0.34440344403444034, "grad_norm": 0.08912832958757311, "learning_rate": 0.0009999690985551507, "loss": 3.708, "step": 70 }, { "epoch": 0.34932349323493234, "grad_norm": 0.08077941792642568, "learning_rate": 0.0009999596390564446, "loss": 3.9577, "step": 71 }, { "epoch": 0.35424354243542433, "grad_norm": 0.0656658289447873, "learning_rate": 0.0009999489183633629, "loss": 3.6382, "step": 72 }, { "epoch": 0.3591635916359164, "grad_norm": 0.05472179563683537, "learning_rate": 0.0009999369365029486, "loss": 3.8356, "step": 73 }, { "epoch": 0.3640836408364084, "grad_norm": 0.04597337952283742, "learning_rate": 0.0009999236935054282, "loss": 3.9062, "step": 74 }, { "epoch": 0.36900369003690037, "grad_norm": 0.043535223396858114, "learning_rate": 0.0009999091894042076, "loss": 3.8314, "step": 75 }, { "epoch": 0.37392373923739236, "grad_norm": 0.047343388386465526, "learning_rate": 0.0009998934242358751, "loss": 3.9269, "step": 76 }, { "epoch": 0.37884378843788435, "grad_norm": 0.04498695222915248, "learning_rate": 0.0009998763980401998, "loss": 3.9541, "step": 77 }, { "epoch": 0.3837638376383764, "grad_norm": 0.043056806952490874, "learning_rate": 0.0009998581108601314, "loss": 4.0565, "step": 78 }, { "epoch": 0.3886838868388684, "grad_norm": 0.040177421590214396, "learning_rate": 0.0009998385627418016, "loss": 3.9988, "step": 79 }, { "epoch": 0.3936039360393604, "grad_norm": 0.03374787590442813, "learning_rate": 0.000999817753734522, "loss": 3.7781, "step": 80 }, { "epoch": 0.3985239852398524, "grad_norm": 0.029392765712199523, "learning_rate": 0.000999795683890785, "loss": 3.7711, "step": 81 }, { "epoch": 0.4034440344403444, "grad_norm": 0.02665859245120251, "learning_rate": 0.0009997723532662644, "loss": 3.5125, "step": 82 }, { "epoch": 0.40836408364083643, "grad_norm": 0.02704087809873294, "learning_rate": 0.0009997477619198137, "loss": 3.7158, "step": 83 }, { "epoch": 0.4132841328413284, "grad_norm": 0.024799014994737852, "learning_rate": 0.0009997219099134665, "loss": 3.9818, "step": 84 }, { "epoch": 0.4182041820418204, "grad_norm": 0.02410382826709373, "learning_rate": 0.0009996947973124372, "loss": 4.0264, "step": 85 }, { "epoch": 0.4231242312423124, "grad_norm": 0.021976857679818027, "learning_rate": 0.0009996664241851197, "loss": 3.7906, "step": 86 }, { "epoch": 0.4280442804428044, "grad_norm": 0.02114358235234028, "learning_rate": 0.000999636790603088, "loss": 3.7098, "step": 87 }, { "epoch": 0.43296432964329645, "grad_norm": 0.019039713734478464, "learning_rate": 0.0009996058966410953, "loss": 3.9083, "step": 88 }, { "epoch": 0.43788437884378845, "grad_norm": 0.017806998707319197, "learning_rate": 0.0009995737423770744, "loss": 3.7642, "step": 89 }, { "epoch": 0.44280442804428044, "grad_norm": 0.017783545719065885, "learning_rate": 0.0009995403278921379, "loss": 3.9162, "step": 90 }, { "epoch": 0.44772447724477243, "grad_norm": 0.016736700446281706, "learning_rate": 0.0009995056532705764, "loss": 3.7833, "step": 91 }, { "epoch": 0.45264452644526443, "grad_norm": 0.01825949796883451, "learning_rate": 0.0009994697185998602, "loss": 3.8724, "step": 92 }, { "epoch": 0.4575645756457565, "grad_norm": 0.015353011470520822, "learning_rate": 0.0009994325239706377, "loss": 3.7997, "step": 93 }, { "epoch": 0.46248462484624847, "grad_norm": 0.013800899406207419, "learning_rate": 0.0009993940694767358, "loss": 3.8333, "step": 94 }, { "epoch": 0.46740467404674046, "grad_norm": 0.013872690358034286, "learning_rate": 0.0009993543552151594, "loss": 3.6946, "step": 95 }, { "epoch": 0.47232472324723246, "grad_norm": 0.01489320276267857, "learning_rate": 0.0009993133812860916, "loss": 3.8645, "step": 96 }, { "epoch": 0.47724477244772445, "grad_norm": 0.010976506372676402, "learning_rate": 0.0009992711477928924, "loss": 3.8345, "step": 97 }, { "epoch": 0.4821648216482165, "grad_norm": 0.011484036433573718, "learning_rate": 0.0009992276548421004, "loss": 3.6816, "step": 98 }, { "epoch": 0.4870848708487085, "grad_norm": 0.012343394879271503, "learning_rate": 0.0009991829025434303, "loss": 3.5457, "step": 99 }, { "epoch": 0.4920049200492005, "grad_norm": 0.01067423621804686, "learning_rate": 0.0009991368910097739, "loss": 3.6754, "step": 100 }, { "epoch": 0.4969249692496925, "grad_norm": 0.012274651224538557, "learning_rate": 0.0009990896203571995, "loss": 3.633, "step": 101 }, { "epoch": 0.5018450184501845, "grad_norm": 0.009300124762404886, "learning_rate": 0.0009990410907049516, "loss": 3.7742, "step": 102 }, { "epoch": 0.5067650676506765, "grad_norm": 0.008661380495732098, "learning_rate": 0.000998991302175451, "loss": 3.9134, "step": 103 }, { "epoch": 0.5116851168511685, "grad_norm": 0.008865091779315869, "learning_rate": 0.0009989402548942934, "loss": 3.8769, "step": 104 }, { "epoch": 0.5166051660516605, "grad_norm": 0.00718174558899967, "learning_rate": 0.000998887948990251, "loss": 3.5728, "step": 105 }, { "epoch": 0.5215252152521526, "grad_norm": 0.006814752348531558, "learning_rate": 0.0009988343845952696, "loss": 3.5061, "step": 106 }, { "epoch": 0.5264452644526445, "grad_norm": 0.008168253056158146, "learning_rate": 0.0009987795618444707, "loss": 3.7847, "step": 107 }, { "epoch": 0.5313653136531366, "grad_norm": 0.008603670076908654, "learning_rate": 0.0009987234808761496, "loss": 3.8071, "step": 108 }, { "epoch": 0.5362853628536285, "grad_norm": 0.008304658800334567, "learning_rate": 0.0009986661418317758, "loss": 3.5767, "step": 109 }, { "epoch": 0.5412054120541205, "grad_norm": 0.006183264704939193, "learning_rate": 0.0009986075448559925, "loss": 3.6441, "step": 110 }, { "epoch": 0.5461254612546126, "grad_norm": 0.0064605929091642806, "learning_rate": 0.0009985476900966155, "loss": 3.7029, "step": 111 }, { "epoch": 0.5510455104551045, "grad_norm": 0.007485257726120109, "learning_rate": 0.0009984865777046344, "loss": 3.8985, "step": 112 }, { "epoch": 0.5559655596555966, "grad_norm": 0.005295945932979785, "learning_rate": 0.0009984242078342107, "loss": 3.7772, "step": 113 }, { "epoch": 0.5608856088560885, "grad_norm": 0.005265347384816981, "learning_rate": 0.0009983605806426783, "loss": 3.6679, "step": 114 }, { "epoch": 0.5658056580565806, "grad_norm": 0.005163082669917038, "learning_rate": 0.0009982956962905423, "loss": 3.7457, "step": 115 }, { "epoch": 0.5707257072570726, "grad_norm": 0.005024792777539467, "learning_rate": 0.00099822955494148, "loss": 3.8815, "step": 116 }, { "epoch": 0.5756457564575646, "grad_norm": 0.004379271020490377, "learning_rate": 0.0009981621567623385, "loss": 3.6623, "step": 117 }, { "epoch": 0.5805658056580566, "grad_norm": 0.003993159888472669, "learning_rate": 0.0009980935019231363, "loss": 3.8111, "step": 118 }, { "epoch": 0.5854858548585485, "grad_norm": 0.004462283208556875, "learning_rate": 0.0009980235905970615, "loss": 3.9837, "step": 119 }, { "epoch": 0.5904059040590406, "grad_norm": 0.003817651740847746, "learning_rate": 0.0009979524229604719, "loss": 3.673, "step": 120 }, { "epoch": 0.5953259532595326, "grad_norm": 0.004195115229468422, "learning_rate": 0.0009978799991928943, "loss": 3.7004, "step": 121 }, { "epoch": 0.6002460024600246, "grad_norm": 0.004656073220021694, "learning_rate": 0.0009978063194770246, "loss": 3.8333, "step": 122 }, { "epoch": 0.6051660516605166, "grad_norm": 0.00585591159216277, "learning_rate": 0.0009977313839987264, "loss": 3.7221, "step": 123 }, { "epoch": 0.6100861008610086, "grad_norm": 0.006317142851981135, "learning_rate": 0.0009976551929470315, "loss": 3.1623, "step": 124 }, { "epoch": 0.6150061500615006, "grad_norm": 0.004130368432956977, "learning_rate": 0.000997577746514139, "loss": 3.7774, "step": 125 }, { "epoch": 0.6199261992619927, "grad_norm": 0.006813176860685383, "learning_rate": 0.0009974990448954144, "loss": 3.8647, "step": 126 }, { "epoch": 0.6248462484624846, "grad_norm": 0.00426321328457737, "learning_rate": 0.0009974190882893902, "loss": 3.7184, "step": 127 }, { "epoch": 0.6297662976629766, "grad_norm": 0.006060841467500006, "learning_rate": 0.0009973378768977639, "loss": 3.3221, "step": 128 }, { "epoch": 0.6346863468634686, "grad_norm": 0.0044305555328754, "learning_rate": 0.0009972554109253988, "loss": 3.6133, "step": 129 }, { "epoch": 0.6396063960639606, "grad_norm": 0.0059752941568840786, "learning_rate": 0.0009971716905803232, "loss": 3.9841, "step": 130 }, { "epoch": 0.6445264452644527, "grad_norm": 0.006831428063363153, "learning_rate": 0.0009970867160737293, "loss": 3.5516, "step": 131 }, { "epoch": 0.6494464944649446, "grad_norm": 0.004011193650054993, "learning_rate": 0.0009970004876199729, "loss": 3.9436, "step": 132 }, { "epoch": 0.6543665436654367, "grad_norm": 0.004598616921108332, "learning_rate": 0.0009969130054365736, "loss": 3.7476, "step": 133 }, { "epoch": 0.6592865928659286, "grad_norm": 0.003436411428204292, "learning_rate": 0.0009968242697442132, "loss": 3.5505, "step": 134 }, { "epoch": 0.6642066420664207, "grad_norm": 0.0024956925590309733, "learning_rate": 0.0009967342807667354, "loss": 3.8556, "step": 135 }, { "epoch": 0.6691266912669127, "grad_norm": 0.007834754554807507, "learning_rate": 0.000996643038731146, "loss": 3.8144, "step": 136 }, { "epoch": 0.6740467404674046, "grad_norm": 0.003097365929296072, "learning_rate": 0.0009965505438676114, "loss": 4.1118, "step": 137 }, { "epoch": 0.6789667896678967, "grad_norm": 0.002841535000915365, "learning_rate": 0.0009964567964094585, "loss": 3.9616, "step": 138 }, { "epoch": 0.6838868388683886, "grad_norm": 0.005264306113158407, "learning_rate": 0.0009963617965931737, "loss": 3.8144, "step": 139 }, { "epoch": 0.6888068880688807, "grad_norm": 0.009380084612995379, "learning_rate": 0.000996265544658403, "loss": 3.6356, "step": 140 }, { "epoch": 0.6937269372693727, "grad_norm": 0.003815028609187437, "learning_rate": 0.0009961680408479509, "loss": 3.7873, "step": 141 }, { "epoch": 0.6986469864698647, "grad_norm": 0.004145241580947982, "learning_rate": 0.0009960692854077795, "loss": 3.461, "step": 142 }, { "epoch": 0.7035670356703567, "grad_norm": 0.00597540613087557, "learning_rate": 0.0009959692785870085, "loss": 3.4951, "step": 143 }, { "epoch": 0.7084870848708487, "grad_norm": 0.0048928957416064485, "learning_rate": 0.0009958680206379148, "loss": 3.8143, "step": 144 }, { "epoch": 0.7134071340713407, "grad_norm": 0.01558958011198066, "learning_rate": 0.0009957655118159304, "loss": 3.8976, "step": 145 }, { "epoch": 0.7183271832718328, "grad_norm": 0.008190132251842834, "learning_rate": 0.0009956617523796435, "loss": 3.7241, "step": 146 }, { "epoch": 0.7232472324723247, "grad_norm": 0.004132598413187414, "learning_rate": 0.0009955567425907967, "loss": 3.8081, "step": 147 }, { "epoch": 0.7281672816728167, "grad_norm": 0.005246689952193781, "learning_rate": 0.0009954504827142873, "loss": 4.1163, "step": 148 }, { "epoch": 0.7330873308733087, "grad_norm": 0.006937935495120278, "learning_rate": 0.0009953429730181654, "loss": 3.9033, "step": 149 }, { "epoch": 0.7380073800738007, "grad_norm": 0.00553281282341774, "learning_rate": 0.000995234213773634, "loss": 3.7663, "step": 150 }, { "epoch": 0.7429274292742928, "grad_norm": 0.010052630350042566, "learning_rate": 0.0009951242052550486, "loss": 3.8507, "step": 151 }, { "epoch": 0.7478474784747847, "grad_norm": 0.009146194349532387, "learning_rate": 0.0009950129477399156, "loss": 3.9986, "step": 152 }, { "epoch": 0.7527675276752768, "grad_norm": 0.004254927451414939, "learning_rate": 0.0009949004415088928, "loss": 3.756, "step": 153 }, { "epoch": 0.7576875768757687, "grad_norm": 0.004566258009854794, "learning_rate": 0.000994786686845787, "loss": 3.9319, "step": 154 }, { "epoch": 0.7626076260762608, "grad_norm": 0.004343826467718725, "learning_rate": 0.0009946716840375553, "loss": 3.7465, "step": 155 }, { "epoch": 0.7675276752767528, "grad_norm": 0.010104837197957616, "learning_rate": 0.0009945554333743024, "loss": 3.9226, "step": 156 }, { "epoch": 0.7724477244772447, "grad_norm": 0.009699543416263633, "learning_rate": 0.0009944379351492817, "loss": 3.5363, "step": 157 }, { "epoch": 0.7773677736777368, "grad_norm": 0.005582043528402483, "learning_rate": 0.0009943191896588934, "loss": 3.8121, "step": 158 }, { "epoch": 0.7822878228782287, "grad_norm": 0.005963841843111033, "learning_rate": 0.0009941991972026837, "loss": 3.5475, "step": 159 }, { "epoch": 0.7872078720787208, "grad_norm": 0.0033846933589833355, "learning_rate": 0.0009940779580833452, "loss": 3.8472, "step": 160 }, { "epoch": 0.7921279212792128, "grad_norm": 0.006813739904324702, "learning_rate": 0.000993955472606714, "loss": 3.1636, "step": 161 }, { "epoch": 0.7970479704797048, "grad_norm": 0.006873595837126895, "learning_rate": 0.0009938317410817716, "loss": 3.6183, "step": 162 }, { "epoch": 0.8019680196801968, "grad_norm": 0.005019928509172446, "learning_rate": 0.000993706763820642, "loss": 3.7982, "step": 163 }, { "epoch": 0.8068880688806888, "grad_norm": 0.009794104710420169, "learning_rate": 0.0009935805411385916, "loss": 3.4254, "step": 164 }, { "epoch": 0.8118081180811808, "grad_norm": 0.005362481306986823, "learning_rate": 0.0009934530733540293, "loss": 3.9166, "step": 165 }, { "epoch": 0.8167281672816729, "grad_norm": 0.00453441276652789, "learning_rate": 0.0009933243607885042, "loss": 3.8026, "step": 166 }, { "epoch": 0.8216482164821648, "grad_norm": 0.010503299671552791, "learning_rate": 0.0009931944037667056, "loss": 3.926, "step": 167 }, { "epoch": 0.8265682656826568, "grad_norm": 0.009176533678634389, "learning_rate": 0.0009930632026164618, "loss": 3.5276, "step": 168 }, { "epoch": 0.8314883148831488, "grad_norm": 0.009503319741778732, "learning_rate": 0.0009929307576687404, "loss": 3.438, "step": 169 }, { "epoch": 0.8364083640836408, "grad_norm": 0.004566861653732803, "learning_rate": 0.0009927970692576455, "loss": 3.7538, "step": 170 }, { "epoch": 0.8413284132841329, "grad_norm": 0.0049156409518450045, "learning_rate": 0.0009926621377204187, "loss": 3.9265, "step": 171 }, { "epoch": 0.8462484624846248, "grad_norm": 0.008302885348502133, "learning_rate": 0.0009925259633974373, "loss": 3.664, "step": 172 }, { "epoch": 0.8511685116851169, "grad_norm": 0.004948228517663498, "learning_rate": 0.0009923885466322135, "loss": 3.9182, "step": 173 }, { "epoch": 0.8560885608856088, "grad_norm": 0.009213133758919082, "learning_rate": 0.0009922498877713937, "loss": 3.9926, "step": 174 }, { "epoch": 0.8610086100861009, "grad_norm": 0.009857177360760583, "learning_rate": 0.000992109987164758, "loss": 3.3913, "step": 175 }, { "epoch": 0.8659286592865929, "grad_norm": 0.005135731548898062, "learning_rate": 0.0009919688451652184, "loss": 3.9237, "step": 176 }, { "epoch": 0.8708487084870848, "grad_norm": 0.010887504031319786, "learning_rate": 0.0009918264621288186, "loss": 3.7417, "step": 177 }, { "epoch": 0.8757687576875769, "grad_norm": 0.01010631352493894, "learning_rate": 0.000991682838414733, "loss": 3.8393, "step": 178 }, { "epoch": 0.8806888068880688, "grad_norm": 0.015058898654133495, "learning_rate": 0.0009915379743852658, "loss": 3.8664, "step": 179 }, { "epoch": 0.8856088560885609, "grad_norm": 0.017208330822043674, "learning_rate": 0.0009913918704058497, "loss": 3.8243, "step": 180 }, { "epoch": 0.8905289052890529, "grad_norm": 0.014609418927871489, "learning_rate": 0.0009912445268450457, "loss": 3.7243, "step": 181 }, { "epoch": 0.8954489544895449, "grad_norm": 0.012937410702504397, "learning_rate": 0.0009910959440745414, "loss": 3.303, "step": 182 }, { "epoch": 0.9003690036900369, "grad_norm": 0.012801935374627647, "learning_rate": 0.0009909461224691504, "loss": 3.6529, "step": 183 }, { "epoch": 0.9052890528905289, "grad_norm": 0.011916501283711382, "learning_rate": 0.000990795062406812, "loss": 3.5944, "step": 184 }, { "epoch": 0.9102091020910209, "grad_norm": 0.012554422060898077, "learning_rate": 0.0009906427642685889, "loss": 3.8442, "step": 185 }, { "epoch": 0.915129151291513, "grad_norm": 0.013149591499044128, "learning_rate": 0.000990489228438667, "loss": 3.5855, "step": 186 }, { "epoch": 0.9200492004920049, "grad_norm": 0.030898384666792792, "learning_rate": 0.000990334455304355, "loss": 3.4933, "step": 187 }, { "epoch": 0.9249692496924969, "grad_norm": 0.023776803907652563, "learning_rate": 0.0009901784452560822, "loss": 3.6596, "step": 188 }, { "epoch": 0.9298892988929889, "grad_norm": 0.026329729550749793, "learning_rate": 0.0009900211986873986, "loss": 3.6288, "step": 189 }, { "epoch": 0.9348093480934809, "grad_norm": 0.025607858838320275, "learning_rate": 0.0009898627159949727, "loss": 3.736, "step": 190 }, { "epoch": 0.939729397293973, "grad_norm": 0.03963240646442274, "learning_rate": 0.0009897029975785923, "loss": 3.4148, "step": 191 }, { "epoch": 0.9446494464944649, "grad_norm": 0.08192379546804401, "learning_rate": 0.0009895420438411615, "loss": 3.2711, "step": 192 }, { "epoch": 0.949569495694957, "grad_norm": 0.0985009260938394, "learning_rate": 0.000989379855188701, "loss": 3.8019, "step": 193 }, { "epoch": 0.9544895448954489, "grad_norm": 0.043900586894409496, "learning_rate": 0.000989216432030347, "loss": 3.7392, "step": 194 }, { "epoch": 0.959409594095941, "grad_norm": 0.0689655489481743, "learning_rate": 0.000989051774778349, "loss": 3.9053, "step": 195 }, { "epoch": 0.964329643296433, "grad_norm": 0.08653988789403717, "learning_rate": 0.0009888858838480706, "loss": 3.8794, "step": 196 }, { "epoch": 0.9692496924969249, "grad_norm": 0.06925606637171328, "learning_rate": 0.0009887187596579864, "loss": 3.8982, "step": 197 }, { "epoch": 0.974169741697417, "grad_norm": 0.06481460546921952, "learning_rate": 0.0009885504026296834, "loss": 3.6986, "step": 198 }, { "epoch": 0.9790897908979089, "grad_norm": 0.1266909366307376, "learning_rate": 0.0009883808131878572, "loss": 3.535, "step": 199 }, { "epoch": 0.984009840098401, "grad_norm": 0.3805925587298384, "learning_rate": 0.000988209991760313, "loss": 3.904, "step": 200 }, { "epoch": 0.988929889298893, "grad_norm": 0.1263317502293622, "learning_rate": 0.0009880379387779636, "loss": 3.8075, "step": 201 }, { "epoch": 0.993849938499385, "grad_norm": 0.20839896285423296, "learning_rate": 0.0009878646546748286, "loss": 3.7619, "step": 202 }, { "epoch": 0.998769987699877, "grad_norm": 0.20444450501882425, "learning_rate": 0.000987690139888033, "loss": 3.6556, "step": 203 }, { "epoch": 1.0, "grad_norm": 0.20444450501882425, "learning_rate": 0.0009875143948578067, "loss": 0.9351, "step": 204 }, { "epoch": 1.004920049200492, "grad_norm": 0.09478251602044101, "learning_rate": 0.0009873374200274824, "loss": 3.6747, "step": 205 }, { "epoch": 1.009840098400984, "grad_norm": 0.0941444684947407, "learning_rate": 0.0009871592158434961, "loss": 3.5003, "step": 206 }, { "epoch": 1.014760147601476, "grad_norm": 0.2306963279437358, "learning_rate": 0.0009869797827553838, "loss": 4.0016, "step": 207 }, { "epoch": 1.019680196801968, "grad_norm": 0.3303114410647283, "learning_rate": 0.000986799121215782, "loss": 3.6021, "step": 208 }, { "epoch": 1.0246002460024601, "grad_norm": 0.18433196698803897, "learning_rate": 0.0009866172316804265, "loss": 3.7935, "step": 209 }, { "epoch": 1.029520295202952, "grad_norm": 0.1748888199159697, "learning_rate": 0.0009864341146081502, "loss": 3.5345, "step": 210 }, { "epoch": 1.034440344403444, "grad_norm": 0.15474637391376728, "learning_rate": 0.0009862497704608828, "loss": 3.7451, "step": 211 }, { "epoch": 1.039360393603936, "grad_norm": 0.08628865296812444, "learning_rate": 0.0009860641997036497, "loss": 3.769, "step": 212 }, { "epoch": 1.044280442804428, "grad_norm": 0.23809011608101877, "learning_rate": 0.0009858774028045699, "loss": 3.729, "step": 213 }, { "epoch": 1.04920049200492, "grad_norm": 0.18824830408999266, "learning_rate": 0.0009856893802348563, "loss": 3.8546, "step": 214 }, { "epoch": 1.054120541205412, "grad_norm": 0.2574179447004306, "learning_rate": 0.0009855001324688127, "loss": 3.5298, "step": 215 }, { "epoch": 1.0590405904059041, "grad_norm": 0.16565880916156459, "learning_rate": 0.0009853096599838345, "loss": 3.5833, "step": 216 }, { "epoch": 1.063960639606396, "grad_norm": 0.2118256774642086, "learning_rate": 0.0009851179632604057, "loss": 3.6864, "step": 217 }, { "epoch": 1.068880688806888, "grad_norm": 0.3898041190016308, "learning_rate": 0.0009849250427820995, "loss": 3.729, "step": 218 }, { "epoch": 1.07380073800738, "grad_norm": 0.4653482042564478, "learning_rate": 0.0009847308990355752, "loss": 3.8538, "step": 219 }, { "epoch": 1.0787207872078721, "grad_norm": 0.8359476966510965, "learning_rate": 0.0009845355325105785, "loss": 3.6701, "step": 220 }, { "epoch": 1.083640836408364, "grad_norm": 0.3102798276450012, "learning_rate": 0.0009843389436999396, "loss": 3.7686, "step": 221 }, { "epoch": 1.088560885608856, "grad_norm": 0.7690704111857389, "learning_rate": 0.0009841411330995717, "loss": 3.5764, "step": 222 }, { "epoch": 1.0934809348093482, "grad_norm": 0.2384664805026588, "learning_rate": 0.0009839421012084709, "loss": 3.6365, "step": 223 }, { "epoch": 1.09840098400984, "grad_norm": 0.5654711872402559, "learning_rate": 0.0009837418485287126, "loss": 3.8112, "step": 224 }, { "epoch": 1.103321033210332, "grad_norm": 0.3858041722567133, "learning_rate": 0.0009835403755654535, "loss": 3.3975, "step": 225 }, { "epoch": 1.1082410824108242, "grad_norm": 0.6198275550100018, "learning_rate": 0.0009833376828269273, "loss": 3.6216, "step": 226 }, { "epoch": 1.1131611316113161, "grad_norm": 0.4993180290625588, "learning_rate": 0.0009831337708244453, "loss": 3.8742, "step": 227 }, { "epoch": 1.118081180811808, "grad_norm": 0.4402499870650823, "learning_rate": 0.0009829286400723945, "loss": 3.8792, "step": 228 }, { "epoch": 1.1230012300123002, "grad_norm": 0.3762913075276763, "learning_rate": 0.0009827222910882359, "loss": 3.9433, "step": 229 }, { "epoch": 1.1279212792127922, "grad_norm": 0.49651040125049095, "learning_rate": 0.000982514724392504, "loss": 3.4985, "step": 230 }, { "epoch": 1.132841328413284, "grad_norm": 0.42554000333749126, "learning_rate": 0.000982305940508805, "loss": 3.4263, "step": 231 }, { "epoch": 1.137761377613776, "grad_norm": 0.530705322924196, "learning_rate": 0.0009820959399638157, "loss": 3.7069, "step": 232 }, { "epoch": 1.1426814268142682, "grad_norm": 0.29482232131486374, "learning_rate": 0.0009818847232872813, "loss": 3.6482, "step": 233 }, { "epoch": 1.1476014760147601, "grad_norm": 0.2099839675441405, "learning_rate": 0.000981672291012016, "loss": 3.6422, "step": 234 }, { "epoch": 1.152521525215252, "grad_norm": 0.22112570416941696, "learning_rate": 0.0009814586436738997, "loss": 3.4488, "step": 235 }, { "epoch": 1.1574415744157442, "grad_norm": 0.11385488525868268, "learning_rate": 0.0009812437818118773, "loss": 3.755, "step": 236 }, { "epoch": 1.1623616236162362, "grad_norm": 3.7642998258010123, "learning_rate": 0.000981027705967958, "loss": 3.5462, "step": 237 }, { "epoch": 1.1672816728167281, "grad_norm": 0.6201764434939466, "learning_rate": 0.0009808104166872127, "loss": 3.7395, "step": 238 }, { "epoch": 1.17220172201722, "grad_norm": 0.586001244781305, "learning_rate": 0.000980591914517774, "loss": 3.6089, "step": 239 }, { "epoch": 1.1771217712177122, "grad_norm": 0.12961439190323135, "learning_rate": 0.0009803722000108338, "loss": 3.6063, "step": 240 }, { "epoch": 1.1820418204182042, "grad_norm": 0.2688388532547322, "learning_rate": 0.0009801512737206421, "loss": 3.6857, "step": 241 }, { "epoch": 1.186961869618696, "grad_norm": 0.9745964151296017, "learning_rate": 0.000979929136204506, "loss": 3.6506, "step": 242 }, { "epoch": 1.1918819188191883, "grad_norm": 0.164274343911916, "learning_rate": 0.0009797057880227879, "loss": 3.7371, "step": 243 }, { "epoch": 1.1968019680196802, "grad_norm": 0.5941646091961867, "learning_rate": 0.0009794812297389038, "loss": 3.5829, "step": 244 }, { "epoch": 1.2017220172201721, "grad_norm": 0.30305064002921134, "learning_rate": 0.0009792554619193234, "loss": 3.4182, "step": 245 }, { "epoch": 1.2066420664206643, "grad_norm": 0.3652438605824011, "learning_rate": 0.0009790284851335664, "loss": 3.4222, "step": 246 }, { "epoch": 1.2115621156211562, "grad_norm": 0.2323728784520893, "learning_rate": 0.000978800299954203, "loss": 3.6023, "step": 247 }, { "epoch": 1.2164821648216482, "grad_norm": 0.6359015261923908, "learning_rate": 0.0009785709069568512, "loss": 3.5869, "step": 248 }, { "epoch": 1.2214022140221403, "grad_norm": 0.24932938440900163, "learning_rate": 0.0009783403067201762, "loss": 3.7417, "step": 249 }, { "epoch": 1.2263222632226323, "grad_norm": 0.29442612283409253, "learning_rate": 0.000978108499825888, "loss": 3.5617, "step": 250 }, { "epoch": 1.2312423124231242, "grad_norm": 0.6250891912525537, "learning_rate": 0.0009778754868587415, "loss": 3.4674, "step": 251 }, { "epoch": 1.2361623616236161, "grad_norm": 0.23452020287784042, "learning_rate": 0.000977641268406533, "loss": 3.544, "step": 252 }, { "epoch": 1.2410824108241083, "grad_norm": 0.3967384259496121, "learning_rate": 0.0009774058450601002, "loss": 3.6445, "step": 253 }, { "epoch": 1.2460024600246002, "grad_norm": 0.4135256445633041, "learning_rate": 0.0009771692174133206, "loss": 3.3449, "step": 254 }, { "epoch": 1.2509225092250922, "grad_norm": 0.34618067165726235, "learning_rate": 0.000976931386063109, "loss": 3.3063, "step": 255 }, { "epoch": 1.2558425584255843, "grad_norm": 0.20852684884483266, "learning_rate": 0.0009766923516094169, "loss": 3.9835, "step": 256 }, { "epoch": 1.2607626076260763, "grad_norm": 0.44550308690648577, "learning_rate": 0.0009764521146552309, "loss": 3.427, "step": 257 }, { "epoch": 1.2656826568265682, "grad_norm": 0.7150150644386788, "learning_rate": 0.000976210675806571, "loss": 3.5343, "step": 258 }, { "epoch": 1.2706027060270602, "grad_norm": 0.40806003183988315, "learning_rate": 0.0009759680356724887, "loss": 3.6619, "step": 259 }, { "epoch": 1.2755227552275523, "grad_norm": 0.3628227763883104, "learning_rate": 0.0009757241948650665, "loss": 3.3594, "step": 260 }, { "epoch": 1.2804428044280443, "grad_norm": 0.4906030075174576, "learning_rate": 0.0009754791539994152, "loss": 3.3433, "step": 261 }, { "epoch": 1.2853628536285364, "grad_norm": 0.41483321889345176, "learning_rate": 0.0009752329136936732, "loss": 2.8242, "step": 262 }, { "epoch": 1.2902829028290284, "grad_norm": 0.4518318081713362, "learning_rate": 0.0009749854745690041, "loss": 2.8514, "step": 263 }, { "epoch": 1.2952029520295203, "grad_norm": 0.7837589838635087, "learning_rate": 0.0009747368372495962, "loss": 3.724, "step": 264 }, { "epoch": 1.3001230012300122, "grad_norm": 0.4011976796169938, "learning_rate": 0.0009744870023626599, "loss": 3.7361, "step": 265 }, { "epoch": 1.3050430504305042, "grad_norm": 1.0994650995805915, "learning_rate": 0.0009742359705384269, "loss": 3.9524, "step": 266 }, { "epoch": 1.3099630996309963, "grad_norm": 0.4301325294163687, "learning_rate": 0.0009739837424101483, "loss": 3.761, "step": 267 }, { "epoch": 1.3148831488314883, "grad_norm": 1.565233399119502, "learning_rate": 0.0009737303186140927, "loss": 3.6346, "step": 268 }, { "epoch": 1.3198031980319804, "grad_norm": 0.37889558683703384, "learning_rate": 0.0009734756997895449, "loss": 3.5199, "step": 269 }, { "epoch": 1.3247232472324724, "grad_norm": 0.3458044512552722, "learning_rate": 0.0009732198865788047, "loss": 3.4487, "step": 270 }, { "epoch": 1.3296432964329643, "grad_norm": 0.6772185934092818, "learning_rate": 0.0009729628796271843, "loss": 3.6103, "step": 271 }, { "epoch": 1.3345633456334562, "grad_norm": 0.7141420429862583, "learning_rate": 0.0009727046795830078, "loss": 3.3104, "step": 272 }, { "epoch": 1.3394833948339484, "grad_norm": 0.4120267781758608, "learning_rate": 0.0009724452870976083, "loss": 3.3661, "step": 273 }, { "epoch": 1.3444034440344403, "grad_norm": 0.2651575863136014, "learning_rate": 0.0009721847028253276, "loss": 3.9093, "step": 274 }, { "epoch": 1.3493234932349323, "grad_norm": 0.4512140128337065, "learning_rate": 0.0009719229274235134, "loss": 3.576, "step": 275 }, { "epoch": 1.3542435424354244, "grad_norm": 0.5584213382397976, "learning_rate": 0.0009716599615525183, "loss": 3.3143, "step": 276 }, { "epoch": 1.3591635916359164, "grad_norm": 0.3494792560205918, "learning_rate": 0.0009713958058756984, "loss": 3.6465, "step": 277 }, { "epoch": 1.3640836408364083, "grad_norm": 1.62213007248713, "learning_rate": 0.0009711304610594102, "loss": 3.4472, "step": 278 }, { "epoch": 1.3690036900369003, "grad_norm": 0.29246343627977933, "learning_rate": 0.0009708639277730111, "loss": 3.3673, "step": 279 }, { "epoch": 1.3739237392373924, "grad_norm": 0.7619730439753597, "learning_rate": 0.0009705962066888554, "loss": 3.6782, "step": 280 }, { "epoch": 1.3788437884378844, "grad_norm": 0.2574250969410459, "learning_rate": 0.0009703272984822946, "loss": 3.7821, "step": 281 }, { "epoch": 1.3837638376383765, "grad_norm": 0.3148569358701635, "learning_rate": 0.0009700572038316744, "loss": 3.6218, "step": 282 }, { "epoch": 1.3886838868388685, "grad_norm": 0.32713215427749875, "learning_rate": 0.0009697859234183336, "loss": 3.4819, "step": 283 }, { "epoch": 1.3936039360393604, "grad_norm": 0.179744995084754, "learning_rate": 0.0009695134579266021, "loss": 3.4576, "step": 284 }, { "epoch": 1.3985239852398523, "grad_norm": 0.3224129382202447, "learning_rate": 0.000969239808043799, "loss": 3.6331, "step": 285 }, { "epoch": 1.4034440344403443, "grad_norm": 0.39076912268452235, "learning_rate": 0.0009689649744602317, "loss": 3.7079, "step": 286 }, { "epoch": 1.4083640836408364, "grad_norm": 0.4663213182491334, "learning_rate": 0.0009686889578691931, "loss": 3.7817, "step": 287 }, { "epoch": 1.4132841328413284, "grad_norm": 0.557118617245505, "learning_rate": 0.0009684117589669608, "loss": 3.642, "step": 288 }, { "epoch": 1.4182041820418205, "grad_norm": 0.6355128864996317, "learning_rate": 0.0009681333784527945, "loss": 3.5107, "step": 289 }, { "epoch": 1.4231242312423125, "grad_norm": 0.19566853206323961, "learning_rate": 0.0009678538170289347, "loss": 3.5605, "step": 290 }, { "epoch": 1.4280442804428044, "grad_norm": 0.23094041752650835, "learning_rate": 0.000967573075400601, "loss": 3.3831, "step": 291 }, { "epoch": 1.4329643296432963, "grad_norm": 0.47112027350818103, "learning_rate": 0.0009672911542759901, "loss": 3.7416, "step": 292 }, { "epoch": 1.4378843788437885, "grad_norm": 0.350160740704678, "learning_rate": 0.0009670080543662741, "loss": 3.5559, "step": 293 }, { "epoch": 1.4428044280442804, "grad_norm": 0.24808500878879194, "learning_rate": 0.0009667237763855985, "loss": 3.6448, "step": 294 }, { "epoch": 1.4477244772447724, "grad_norm": 0.4815912710354574, "learning_rate": 0.000966438321051081, "loss": 3.3488, "step": 295 }, { "epoch": 1.4526445264452645, "grad_norm": 0.2394729167732887, "learning_rate": 0.0009661516890828088, "loss": 3.7256, "step": 296 }, { "epoch": 1.4575645756457565, "grad_norm": 0.2870363689488657, "learning_rate": 0.0009658638812038378, "loss": 3.1279, "step": 297 }, { "epoch": 1.4624846248462484, "grad_norm": 0.32304585895849736, "learning_rate": 0.0009655748981401898, "loss": 3.1464, "step": 298 }, { "epoch": 1.4674046740467404, "grad_norm": 0.6189754884370978, "learning_rate": 0.0009652847406208514, "loss": 3.6041, "step": 299 }, { "epoch": 1.4723247232472325, "grad_norm": 0.22638948920788562, "learning_rate": 0.0009649934093777715, "loss": 3.589, "step": 300 }, { "epoch": 1.4772447724477245, "grad_norm": 0.21585863778420855, "learning_rate": 0.0009647009051458603, "loss": 3.5951, "step": 301 }, { "epoch": 1.4821648216482166, "grad_norm": 0.3536040437725391, "learning_rate": 0.0009644072286629867, "loss": 3.7928, "step": 302 }, { "epoch": 1.4870848708487086, "grad_norm": 0.5243346230650326, "learning_rate": 0.0009641123806699767, "loss": 3.7026, "step": 303 }, { "epoch": 1.4920049200492005, "grad_norm": 0.26749873487852593, "learning_rate": 0.0009638163619106117, "loss": 2.9453, "step": 304 }, { "epoch": 1.4969249692496924, "grad_norm": 0.5479172475912005, "learning_rate": 0.0009635191731316261, "loss": 3.3787, "step": 305 }, { "epoch": 1.5018450184501844, "grad_norm": 0.5455825445492307, "learning_rate": 0.0009632208150827064, "loss": 3.7947, "step": 306 }, { "epoch": 1.5067650676506765, "grad_norm": 0.37879011661400047, "learning_rate": 0.0009629212885164882, "loss": 3.3764, "step": 307 }, { "epoch": 1.5116851168511685, "grad_norm": 0.4979415145168532, "learning_rate": 0.0009626205941885549, "loss": 2.9422, "step": 308 }, { "epoch": 1.5166051660516606, "grad_norm": 0.28782667682795704, "learning_rate": 0.0009623187328574356, "loss": 3.8288, "step": 309 }, { "epoch": 1.5215252152521526, "grad_norm": 0.46528064339124825, "learning_rate": 0.0009620157052846037, "loss": 3.7482, "step": 310 }, { "epoch": 1.5264452644526445, "grad_norm": 0.17752123267429504, "learning_rate": 0.0009617115122344741, "loss": 3.2648, "step": 311 }, { "epoch": 1.5313653136531364, "grad_norm": 0.2875602791734218, "learning_rate": 0.0009614061544744017, "loss": 3.6375, "step": 312 }, { "epoch": 1.5362853628536284, "grad_norm": 0.24412192070944763, "learning_rate": 0.00096109963277468, "loss": 3.4487, "step": 313 }, { "epoch": 1.5412054120541205, "grad_norm": 0.16327320950209825, "learning_rate": 0.0009607919479085381, "loss": 3.316, "step": 314 }, { "epoch": 1.5461254612546127, "grad_norm": 0.24865602323088623, "learning_rate": 0.0009604831006521393, "loss": 3.7656, "step": 315 }, { "epoch": 1.5510455104551046, "grad_norm": 0.3326277641539411, "learning_rate": 0.0009601730917845796, "loss": 3.9656, "step": 316 }, { "epoch": 1.5559655596555966, "grad_norm": 0.19382910119680644, "learning_rate": 0.0009598619220878852, "loss": 3.5134, "step": 317 }, { "epoch": 1.5608856088560885, "grad_norm": 0.48608377810478826, "learning_rate": 0.00095954959234701, "loss": 3.2783, "step": 318 }, { "epoch": 1.5658056580565805, "grad_norm": 0.2543464742030402, "learning_rate": 0.0009592361033498348, "loss": 3.5159, "step": 319 }, { "epoch": 1.5707257072570726, "grad_norm": 0.18811009046084337, "learning_rate": 0.0009589214558871647, "loss": 3.5602, "step": 320 }, { "epoch": 1.5756457564575646, "grad_norm": 0.24214106586540546, "learning_rate": 0.0009586056507527265, "loss": 3.0433, "step": 321 }, { "epoch": 1.5805658056580567, "grad_norm": 0.2704162244306642, "learning_rate": 0.0009582886887431684, "loss": 3.404, "step": 322 }, { "epoch": 1.5854858548585486, "grad_norm": 0.24934433568506673, "learning_rate": 0.000957970570658056, "loss": 3.2149, "step": 323 }, { "epoch": 1.5904059040590406, "grad_norm": 0.36724513179320206, "learning_rate": 0.0009576512972998718, "loss": 3.6271, "step": 324 }, { "epoch": 1.5953259532595325, "grad_norm": 0.531717660893872, "learning_rate": 0.000957330869474012, "loss": 3.4706, "step": 325 }, { "epoch": 1.6002460024600245, "grad_norm": 0.2962376986013346, "learning_rate": 0.0009570092879887858, "loss": 3.4871, "step": 326 }, { "epoch": 1.6051660516605166, "grad_norm": 0.6325498715633567, "learning_rate": 0.0009566865536554119, "loss": 3.4106, "step": 327 }, { "epoch": 1.6100861008610086, "grad_norm": 0.324203009605223, "learning_rate": 0.0009563626672880177, "loss": 3.2789, "step": 328 }, { "epoch": 1.6150061500615007, "grad_norm": 0.2351884956584418, "learning_rate": 0.0009560376297036362, "loss": 3.0579, "step": 329 }, { "epoch": 1.6199261992619927, "grad_norm": 0.34083687176100375, "learning_rate": 0.0009557114417222051, "loss": 3.8459, "step": 330 }, { "epoch": 1.6248462484624846, "grad_norm": 0.6289716042305942, "learning_rate": 0.0009553841041665632, "loss": 3.7445, "step": 331 }, { "epoch": 1.6297662976629765, "grad_norm": 0.38023490362523016, "learning_rate": 0.0009550556178624503, "loss": 3.257, "step": 332 }, { "epoch": 1.6346863468634685, "grad_norm": 0.30997680628371055, "learning_rate": 0.000954725983638503, "loss": 3.1163, "step": 333 }, { "epoch": 1.6396063960639606, "grad_norm": 0.2582244594013547, "learning_rate": 0.0009543952023262543, "loss": 3.7255, "step": 334 }, { "epoch": 1.6445264452644528, "grad_norm": 0.43517647986356844, "learning_rate": 0.0009540632747601308, "loss": 3.7526, "step": 335 }, { "epoch": 1.6494464944649447, "grad_norm": 0.2926579676492897, "learning_rate": 0.00095373020177745, "loss": 3.5567, "step": 336 }, { "epoch": 1.6543665436654367, "grad_norm": 0.2621308688815202, "learning_rate": 0.0009533959842184194, "loss": 3.2054, "step": 337 }, { "epoch": 1.6592865928659286, "grad_norm": 0.3635331133769554, "learning_rate": 0.000953060622926134, "loss": 3.3666, "step": 338 }, { "epoch": 1.6642066420664205, "grad_norm": 0.2632708330477078, "learning_rate": 0.0009527241187465734, "loss": 3.447, "step": 339 }, { "epoch": 1.6691266912669127, "grad_norm": 0.29721203766944343, "learning_rate": 0.0009523864725286003, "loss": 3.5028, "step": 340 }, { "epoch": 1.6740467404674046, "grad_norm": 0.4242550631778796, "learning_rate": 0.0009520476851239588, "loss": 3.536, "step": 341 }, { "epoch": 1.6789667896678968, "grad_norm": 0.35014676604019396, "learning_rate": 0.0009517077573872713, "loss": 3.5745, "step": 342 }, { "epoch": 1.6838868388683887, "grad_norm": 0.30493655593585883, "learning_rate": 0.0009513666901760367, "loss": 3.2935, "step": 343 }, { "epoch": 1.6888068880688807, "grad_norm": 0.3098734985167329, "learning_rate": 0.000951024484350629, "loss": 3.4704, "step": 344 }, { "epoch": 1.6937269372693726, "grad_norm": 0.5507680150968964, "learning_rate": 0.0009506811407742937, "loss": 3.2178, "step": 345 }, { "epoch": 1.6986469864698646, "grad_norm": 0.3084593288459406, "learning_rate": 0.0009503366603131467, "loss": 3.1169, "step": 346 }, { "epoch": 1.7035670356703567, "grad_norm": 0.2581566800159827, "learning_rate": 0.0009499910438361719, "loss": 3.5805, "step": 347 }, { "epoch": 1.7084870848708487, "grad_norm": 0.7535922011535993, "learning_rate": 0.0009496442922152186, "loss": 3.0992, "step": 348 }, { "epoch": 1.7134071340713408, "grad_norm": 0.37598063878428384, "learning_rate": 0.000949296406325, "loss": 3.1782, "step": 349 }, { "epoch": 1.7183271832718328, "grad_norm": 0.20563625172743016, "learning_rate": 0.0009489473870430904, "loss": 3.9464, "step": 350 }, { "epoch": 1.7232472324723247, "grad_norm": 0.3159944255389338, "learning_rate": 0.0009485972352499231, "loss": 3.3041, "step": 351 }, { "epoch": 1.7281672816728166, "grad_norm": 0.30202738707993376, "learning_rate": 0.0009482459518287881, "loss": 3.5434, "step": 352 }, { "epoch": 1.7330873308733086, "grad_norm": 0.3022611075035952, "learning_rate": 0.0009478935376658308, "loss": 3.6283, "step": 353 }, { "epoch": 1.7380073800738007, "grad_norm": 0.2727557190500496, "learning_rate": 0.000947539993650048, "loss": 3.2763, "step": 354 }, { "epoch": 1.742927429274293, "grad_norm": 0.48273591947514927, "learning_rate": 0.0009471853206732874, "loss": 3.1452, "step": 355 }, { "epoch": 1.7478474784747848, "grad_norm": 0.40280131860818563, "learning_rate": 0.0009468295196302446, "loss": 3.5016, "step": 356 }, { "epoch": 1.7527675276752768, "grad_norm": 0.6107247814707607, "learning_rate": 0.00094647259141846, "loss": 3.1867, "step": 357 }, { "epoch": 1.7576875768757687, "grad_norm": 0.3167200103329919, "learning_rate": 0.0009461145369383183, "loss": 4.0582, "step": 358 }, { "epoch": 1.7626076260762606, "grad_norm": 0.3569312864497592, "learning_rate": 0.0009457553570930451, "loss": 3.6727, "step": 359 }, { "epoch": 1.7675276752767528, "grad_norm": 0.22806230842722597, "learning_rate": 0.0009453950527887045, "loss": 3.4306, "step": 360 }, { "epoch": 1.7724477244772447, "grad_norm": 0.6495393092314642, "learning_rate": 0.0009450336249341976, "loss": 3.1174, "step": 361 }, { "epoch": 1.777367773677737, "grad_norm": 0.21552460190827805, "learning_rate": 0.0009446710744412595, "loss": 3.6514, "step": 362 }, { "epoch": 1.7822878228782288, "grad_norm": 0.2403528330989743, "learning_rate": 0.0009443074022244572, "loss": 3.672, "step": 363 }, { "epoch": 1.7872078720787208, "grad_norm": 0.6250013969304153, "learning_rate": 0.0009439426092011876, "loss": 3.3313, "step": 364 }, { "epoch": 1.7921279212792127, "grad_norm": 0.29856049215591274, "learning_rate": 0.0009435766962916747, "loss": 3.2844, "step": 365 }, { "epoch": 1.7970479704797047, "grad_norm": 0.23244378420789105, "learning_rate": 0.0009432096644189678, "loss": 3.4367, "step": 366 }, { "epoch": 1.8019680196801968, "grad_norm": 0.33360413408041795, "learning_rate": 0.0009428415145089385, "loss": 3.3866, "step": 367 }, { "epoch": 1.8068880688806888, "grad_norm": 0.33383270397914105, "learning_rate": 0.000942472247490279, "loss": 3.0405, "step": 368 }, { "epoch": 1.811808118081181, "grad_norm": 0.2751034357977177, "learning_rate": 0.0009421018642944997, "loss": 3.1805, "step": 369 }, { "epoch": 1.8167281672816729, "grad_norm": 0.49292210462809727, "learning_rate": 0.000941730365855926, "loss": 3.813, "step": 370 }, { "epoch": 1.8216482164821648, "grad_norm": 0.23600023505704082, "learning_rate": 0.0009413577531116972, "loss": 3.3936, "step": 371 }, { "epoch": 1.8265682656826567, "grad_norm": 0.2048406504192221, "learning_rate": 0.0009409840270017635, "loss": 3.4877, "step": 372 }, { "epoch": 1.8314883148831487, "grad_norm": 0.3018995098729513, "learning_rate": 0.0009406091884688836, "loss": 3.1108, "step": 373 }, { "epoch": 1.8364083640836408, "grad_norm": 0.2475858853969993, "learning_rate": 0.000940233238458622, "loss": 3.7047, "step": 374 }, { "epoch": 1.841328413284133, "grad_norm": 0.18776694009080297, "learning_rate": 0.0009398561779193477, "loss": 3.6018, "step": 375 }, { "epoch": 1.846248462484625, "grad_norm": 0.3980479714166502, "learning_rate": 0.0009394780078022304, "loss": 3.5543, "step": 376 }, { "epoch": 1.8511685116851169, "grad_norm": 0.22848545791843036, "learning_rate": 0.0009390987290612395, "loss": 3.2808, "step": 377 }, { "epoch": 1.8560885608856088, "grad_norm": 0.18984471320600155, "learning_rate": 0.0009387183426531403, "loss": 3.4678, "step": 378 }, { "epoch": 1.8610086100861007, "grad_norm": 0.36752058676203875, "learning_rate": 0.000938336849537493, "loss": 3.0392, "step": 379 }, { "epoch": 1.865928659286593, "grad_norm": 0.2108640675390431, "learning_rate": 0.000937954250676649, "loss": 3.2487, "step": 380 }, { "epoch": 1.8708487084870848, "grad_norm": 0.29128177492257173, "learning_rate": 0.0009375705470357493, "loss": 3.2222, "step": 381 }, { "epoch": 1.875768757687577, "grad_norm": 0.25282535702882175, "learning_rate": 0.0009371857395827218, "loss": 3.1457, "step": 382 }, { "epoch": 1.880688806888069, "grad_norm": 0.24034662337771245, "learning_rate": 0.0009367998292882789, "loss": 3.1859, "step": 383 }, { "epoch": 1.8856088560885609, "grad_norm": 0.25357270024036144, "learning_rate": 0.000936412817125915, "loss": 3.435, "step": 384 }, { "epoch": 1.8905289052890528, "grad_norm": 0.9125320667075362, "learning_rate": 0.0009360247040719039, "loss": 3.7041, "step": 385 }, { "epoch": 1.8954489544895448, "grad_norm": 0.21175151025763722, "learning_rate": 0.0009356354911052967, "loss": 2.989, "step": 386 }, { "epoch": 1.900369003690037, "grad_norm": 0.3808158786073021, "learning_rate": 0.0009352451792079189, "loss": 3.1136, "step": 387 }, { "epoch": 1.9052890528905289, "grad_norm": 0.4002405716566133, "learning_rate": 0.0009348537693643686, "loss": 3.3744, "step": 388 }, { "epoch": 1.910209102091021, "grad_norm": 0.3999462436218137, "learning_rate": 0.0009344612625620133, "loss": 3.7013, "step": 389 }, { "epoch": 1.915129151291513, "grad_norm": 0.24815270393370573, "learning_rate": 0.0009340676597909874, "loss": 2.7016, "step": 390 }, { "epoch": 1.920049200492005, "grad_norm": 0.2631214038178891, "learning_rate": 0.0009336729620441905, "loss": 3.3711, "step": 391 }, { "epoch": 1.9249692496924968, "grad_norm": 0.6066375501836473, "learning_rate": 0.000933277170317284, "loss": 3.5241, "step": 392 }, { "epoch": 1.9298892988929888, "grad_norm": 0.19828746495156796, "learning_rate": 0.000932880285608689, "loss": 2.8406, "step": 393 }, { "epoch": 1.934809348093481, "grad_norm": 0.5106249564719965, "learning_rate": 0.0009324823089195839, "loss": 3.1493, "step": 394 }, { "epoch": 1.939729397293973, "grad_norm": 0.18780891934762423, "learning_rate": 0.000932083241253902, "loss": 3.476, "step": 395 }, { "epoch": 1.944649446494465, "grad_norm": 0.3347574823761304, "learning_rate": 0.0009316830836183279, "loss": 3.1557, "step": 396 }, { "epoch": 1.949569495694957, "grad_norm": 0.38207464156397103, "learning_rate": 0.0009312818370222961, "loss": 3.4673, "step": 397 }, { "epoch": 1.954489544895449, "grad_norm": 0.6973593451136899, "learning_rate": 0.0009308795024779889, "loss": 3.8604, "step": 398 }, { "epoch": 1.9594095940959408, "grad_norm": 0.4600445052176774, "learning_rate": 0.0009304760810003317, "loss": 3.5294, "step": 399 }, { "epoch": 1.964329643296433, "grad_norm": 0.3086455645166654, "learning_rate": 0.0009300715736069929, "loss": 3.1741, "step": 400 }, { "epoch": 1.969249692496925, "grad_norm": 0.25835608222267886, "learning_rate": 0.0009296659813183793, "loss": 3.0548, "step": 401 }, { "epoch": 1.974169741697417, "grad_norm": 0.22614790387111547, "learning_rate": 0.0009292593051576353, "loss": 3.2945, "step": 402 }, { "epoch": 1.979089790897909, "grad_norm": 0.2907192351628518, "learning_rate": 0.0009288515461506388, "loss": 2.9656, "step": 403 }, { "epoch": 1.984009840098401, "grad_norm": 0.26487838548839593, "learning_rate": 0.000928442705326, "loss": 3.3745, "step": 404 }, { "epoch": 1.988929889298893, "grad_norm": 0.1702509548485145, "learning_rate": 0.000928032783715057, "loss": 3.2961, "step": 405 }, { "epoch": 1.9938499384993849, "grad_norm": 0.2534805670323297, "learning_rate": 0.0009276217823518756, "loss": 3.145, "step": 406 }, { "epoch": 1.998769987699877, "grad_norm": 0.17845301826089346, "learning_rate": 0.0009272097022732444, "loss": 3.1244, "step": 407 }, { "epoch": 2.0, "grad_norm": 0.17845301826089346, "learning_rate": 0.0009267965445186732, "loss": 0.9878, "step": 408 }, { "epoch": 2.004920049200492, "grad_norm": 0.32430141611848995, "learning_rate": 0.000926382310130391, "loss": 3.3611, "step": 409 }, { "epoch": 2.009840098400984, "grad_norm": 0.33463033809020054, "learning_rate": 0.0009259670001533423, "loss": 3.5593, "step": 410 }, { "epoch": 2.014760147601476, "grad_norm": 0.2012307985868391, "learning_rate": 0.0009255506156351846, "loss": 3.619, "step": 411 }, { "epoch": 2.019680196801968, "grad_norm": 0.3870363134910316, "learning_rate": 0.0009251331576262866, "loss": 3.3184, "step": 412 }, { "epoch": 2.02460024600246, "grad_norm": 0.371318856309328, "learning_rate": 0.0009247146271797245, "loss": 3.3962, "step": 413 }, { "epoch": 2.029520295202952, "grad_norm": 0.3038638593760832, "learning_rate": 0.0009242950253512801, "loss": 3.0326, "step": 414 }, { "epoch": 2.034440344403444, "grad_norm": 0.3088234205497124, "learning_rate": 0.0009238743531994378, "loss": 3.6699, "step": 415 }, { "epoch": 2.039360393603936, "grad_norm": 0.26990025659109146, "learning_rate": 0.0009234526117853819, "loss": 3.4569, "step": 416 }, { "epoch": 2.044280442804428, "grad_norm": 0.638498321277001, "learning_rate": 0.0009230298021729941, "loss": 3.4258, "step": 417 }, { "epoch": 2.0492004920049203, "grad_norm": 0.31170587645514647, "learning_rate": 0.0009226059254288506, "loss": 3.2613, "step": 418 }, { "epoch": 2.054120541205412, "grad_norm": 0.21867222918249765, "learning_rate": 0.0009221809826222198, "loss": 3.5817, "step": 419 }, { "epoch": 2.059040590405904, "grad_norm": 0.19137931154333443, "learning_rate": 0.0009217549748250593, "loss": 3.3914, "step": 420 }, { "epoch": 2.063960639606396, "grad_norm": 0.21214882208713806, "learning_rate": 0.0009213279031120129, "loss": 3.6677, "step": 421 }, { "epoch": 2.068880688806888, "grad_norm": 0.22671933120914375, "learning_rate": 0.0009208997685604083, "loss": 3.1606, "step": 422 }, { "epoch": 2.07380073800738, "grad_norm": 0.3525550742657656, "learning_rate": 0.0009204705722502549, "loss": 3.3285, "step": 423 }, { "epoch": 2.078720787207872, "grad_norm": 0.5919495728226617, "learning_rate": 0.0009200403152642399, "loss": 3.3257, "step": 424 }, { "epoch": 2.0836408364083643, "grad_norm": 0.7414796152266768, "learning_rate": 0.0009196089986877261, "loss": 3.1177, "step": 425 }, { "epoch": 2.088560885608856, "grad_norm": 0.207762777841323, "learning_rate": 0.0009191766236087493, "loss": 3.8463, "step": 426 }, { "epoch": 2.093480934809348, "grad_norm": 0.3252463961438015, "learning_rate": 0.0009187431911180159, "loss": 3.0986, "step": 427 }, { "epoch": 2.09840098400984, "grad_norm": 0.3227911972426935, "learning_rate": 0.0009183087023088995, "loss": 3.5271, "step": 428 }, { "epoch": 2.103321033210332, "grad_norm": 0.48195270377395966, "learning_rate": 0.000917873158277438, "loss": 3.322, "step": 429 }, { "epoch": 2.108241082410824, "grad_norm": 0.3136333863551474, "learning_rate": 0.0009174365601223313, "loss": 3.9275, "step": 430 }, { "epoch": 2.113161131611316, "grad_norm": 0.2414052715153622, "learning_rate": 0.000916998908944939, "loss": 3.6049, "step": 431 }, { "epoch": 2.1180811808118083, "grad_norm": 0.45621932184948716, "learning_rate": 0.0009165602058492764, "loss": 2.9191, "step": 432 }, { "epoch": 2.1230012300123002, "grad_norm": 0.3422350420543672, "learning_rate": 0.0009161204519420125, "loss": 3.3338, "step": 433 }, { "epoch": 2.127921279212792, "grad_norm": 0.21600427862346466, "learning_rate": 0.0009156796483324672, "loss": 3.4246, "step": 434 }, { "epoch": 2.132841328413284, "grad_norm": 0.6703130064611811, "learning_rate": 0.0009152377961326085, "loss": 3.2079, "step": 435 }, { "epoch": 2.137761377613776, "grad_norm": 0.19256285757915917, "learning_rate": 0.0009147948964570492, "loss": 3.0555, "step": 436 }, { "epoch": 2.142681426814268, "grad_norm": 0.18420379726193872, "learning_rate": 0.0009143509504230448, "loss": 3.5166, "step": 437 }, { "epoch": 2.14760147601476, "grad_norm": 0.33398930674654564, "learning_rate": 0.0009139059591504902, "loss": 3.478, "step": 438 }, { "epoch": 2.1525215252152523, "grad_norm": 0.39359023035542823, "learning_rate": 0.0009134599237619167, "loss": 3.5699, "step": 439 }, { "epoch": 2.1574415744157442, "grad_norm": 0.24300145646379445, "learning_rate": 0.00091301284538249, "loss": 3.2955, "step": 440 }, { "epoch": 2.162361623616236, "grad_norm": 0.24570868265457563, "learning_rate": 0.0009125647251400067, "loss": 3.3857, "step": 441 }, { "epoch": 2.167281672816728, "grad_norm": 0.23932916468026139, "learning_rate": 0.0009121155641648913, "loss": 3.741, "step": 442 }, { "epoch": 2.17220172201722, "grad_norm": 0.19261545209745753, "learning_rate": 0.000911665363590194, "loss": 2.9056, "step": 443 }, { "epoch": 2.177121771217712, "grad_norm": 0.1976423260962655, "learning_rate": 0.0009112141245515873, "loss": 3.5433, "step": 444 }, { "epoch": 2.1820418204182044, "grad_norm": 0.15301925309500666, "learning_rate": 0.0009107618481873632, "loss": 3.2257, "step": 445 }, { "epoch": 2.1869618696186963, "grad_norm": 0.23110766241733663, "learning_rate": 0.0009103085356384308, "loss": 3.2029, "step": 446 }, { "epoch": 2.1918819188191883, "grad_norm": 0.2226445503919321, "learning_rate": 0.0009098541880483129, "loss": 3.5251, "step": 447 }, { "epoch": 2.19680196801968, "grad_norm": 0.2999843032377674, "learning_rate": 0.0009093988065631432, "loss": 3.4932, "step": 448 }, { "epoch": 2.201722017220172, "grad_norm": 0.19362921027748428, "learning_rate": 0.0009089423923316635, "loss": 3.0706, "step": 449 }, { "epoch": 2.206642066420664, "grad_norm": 0.19755669245060017, "learning_rate": 0.000908484946505221, "loss": 3.5362, "step": 450 }, { "epoch": 2.211562115621156, "grad_norm": 0.19531848761091766, "learning_rate": 0.000908026470237765, "loss": 3.3511, "step": 451 }, { "epoch": 2.2164821648216484, "grad_norm": 0.32041917959568916, "learning_rate": 0.0009075669646858442, "loss": 3.51, "step": 452 }, { "epoch": 2.2214022140221403, "grad_norm": 0.3799546383866389, "learning_rate": 0.0009071064310086039, "loss": 3.2676, "step": 453 }, { "epoch": 2.2263222632226323, "grad_norm": 0.2847254415900049, "learning_rate": 0.0009066448703677828, "loss": 2.8399, "step": 454 }, { "epoch": 2.231242312423124, "grad_norm": 0.19889348632490925, "learning_rate": 0.0009061822839277101, "loss": 3.2658, "step": 455 }, { "epoch": 2.236162361623616, "grad_norm": 0.26445515016487203, "learning_rate": 0.000905718672855303, "loss": 3.459, "step": 456 }, { "epoch": 2.241082410824108, "grad_norm": 0.24475431990377403, "learning_rate": 0.0009052540383200632, "loss": 3.2664, "step": 457 }, { "epoch": 2.2460024600246005, "grad_norm": 0.3826504966928167, "learning_rate": 0.0009047883814940743, "loss": 3.5855, "step": 458 }, { "epoch": 2.2509225092250924, "grad_norm": 0.3714799694142234, "learning_rate": 0.0009043217035519986, "loss": 3.0865, "step": 459 }, { "epoch": 2.2558425584255843, "grad_norm": 0.2508426058649485, "learning_rate": 0.0009038540056710744, "loss": 3.1127, "step": 460 }, { "epoch": 2.2607626076260763, "grad_norm": 0.43574249788791136, "learning_rate": 0.0009033852890311128, "loss": 3.9167, "step": 461 }, { "epoch": 2.265682656826568, "grad_norm": 0.16550488618675252, "learning_rate": 0.0009029155548144945, "loss": 2.7448, "step": 462 }, { "epoch": 2.27060270602706, "grad_norm": 0.20937705440935458, "learning_rate": 0.0009024448042061679, "loss": 3.4577, "step": 463 }, { "epoch": 2.275522755227552, "grad_norm": 0.34076299901510754, "learning_rate": 0.0009019730383936448, "loss": 3.6613, "step": 464 }, { "epoch": 2.280442804428044, "grad_norm": 0.4305275515903465, "learning_rate": 0.0009015002585669981, "loss": 3.5464, "step": 465 }, { "epoch": 2.2853628536285364, "grad_norm": 0.17567075251913614, "learning_rate": 0.0009010264659188582, "loss": 4.0182, "step": 466 }, { "epoch": 2.2902829028290284, "grad_norm": 0.33455246764417906, "learning_rate": 0.0009005516616444111, "loss": 3.4328, "step": 467 }, { "epoch": 2.2952029520295203, "grad_norm": 1.6766060068258941, "learning_rate": 0.0009000758469413946, "loss": 3.2666, "step": 468 }, { "epoch": 2.3001230012300122, "grad_norm": 0.2215626688869319, "learning_rate": 0.0008995990230100949, "loss": 3.2614, "step": 469 }, { "epoch": 2.305043050430504, "grad_norm": 0.2233171403603573, "learning_rate": 0.0008991211910533447, "loss": 3.3527, "step": 470 }, { "epoch": 2.3099630996309966, "grad_norm": 0.30528663358529007, "learning_rate": 0.000898642352276519, "loss": 3.7163, "step": 471 }, { "epoch": 2.3148831488314885, "grad_norm": 0.3418714747422929, "learning_rate": 0.0008981625078875332, "loss": 3.5909, "step": 472 }, { "epoch": 2.3198031980319804, "grad_norm": 0.21716592071407126, "learning_rate": 0.0008976816590968387, "loss": 3.3332, "step": 473 }, { "epoch": 2.3247232472324724, "grad_norm": 0.18315853563422138, "learning_rate": 0.0008971998071174213, "loss": 3.3687, "step": 474 }, { "epoch": 2.3296432964329643, "grad_norm": 0.1624918701658956, "learning_rate": 0.000896716953164797, "loss": 3.1884, "step": 475 }, { "epoch": 2.3345633456334562, "grad_norm": 0.2948657575318485, "learning_rate": 0.0008962330984570095, "loss": 3.0111, "step": 476 }, { "epoch": 2.339483394833948, "grad_norm": 0.274521420195062, "learning_rate": 0.0008957482442146271, "loss": 4.0939, "step": 477 }, { "epoch": 2.34440344403444, "grad_norm": 0.1546110380843756, "learning_rate": 0.0008952623916607392, "loss": 3.5462, "step": 478 }, { "epoch": 2.3493234932349325, "grad_norm": 0.19923869532474567, "learning_rate": 0.000894775542020954, "loss": 4.0528, "step": 479 }, { "epoch": 2.3542435424354244, "grad_norm": 0.2321181689826377, "learning_rate": 0.0008942876965233946, "loss": 3.3518, "step": 480 }, { "epoch": 2.3591635916359164, "grad_norm": 0.817802868964042, "learning_rate": 0.0008937988563986963, "loss": 3.3047, "step": 481 }, { "epoch": 2.3640836408364083, "grad_norm": 0.6157727828803585, "learning_rate": 0.0008933090228800033, "loss": 3.3504, "step": 482 }, { "epoch": 2.3690036900369003, "grad_norm": 0.23835322115747237, "learning_rate": 0.0008928181972029663, "loss": 3.4633, "step": 483 }, { "epoch": 2.373923739237392, "grad_norm": 0.21552157171597958, "learning_rate": 0.0008923263806057383, "loss": 3.3535, "step": 484 }, { "epoch": 2.3788437884378846, "grad_norm": 0.28953306156137, "learning_rate": 0.0008918335743289717, "loss": 3.1705, "step": 485 }, { "epoch": 2.3837638376383765, "grad_norm": 0.33078357256665103, "learning_rate": 0.0008913397796158162, "loss": 3.6369, "step": 486 }, { "epoch": 2.3886838868388685, "grad_norm": 0.2907758583103, "learning_rate": 0.0008908449977119148, "loss": 3.3086, "step": 487 }, { "epoch": 2.3936039360393604, "grad_norm": 0.31197013644111454, "learning_rate": 0.0008903492298654002, "loss": 3.3703, "step": 488 }, { "epoch": 2.3985239852398523, "grad_norm": 0.28730596658805196, "learning_rate": 0.0008898524773268925, "loss": 3.146, "step": 489 }, { "epoch": 2.4034440344403443, "grad_norm": 0.3491779203193017, "learning_rate": 0.0008893547413494962, "loss": 3.1335, "step": 490 }, { "epoch": 2.408364083640836, "grad_norm": 0.19192299139498983, "learning_rate": 0.0008888560231887963, "loss": 3.3841, "step": 491 }, { "epoch": 2.4132841328413286, "grad_norm": 0.28492446202747346, "learning_rate": 0.0008883563241028551, "loss": 3.1354, "step": 492 }, { "epoch": 2.4182041820418205, "grad_norm": 0.31592585945098633, "learning_rate": 0.0008878556453522099, "loss": 3.2418, "step": 493 }, { "epoch": 2.4231242312423125, "grad_norm": 0.3160635879918218, "learning_rate": 0.0008873539881998692, "loss": 3.3456, "step": 494 }, { "epoch": 2.4280442804428044, "grad_norm": 0.22622879963224923, "learning_rate": 0.0008868513539113092, "loss": 3.1366, "step": 495 }, { "epoch": 2.4329643296432963, "grad_norm": 0.2413949810566293, "learning_rate": 0.0008863477437544718, "loss": 2.9125, "step": 496 }, { "epoch": 2.4378843788437883, "grad_norm": 1.5053779612179796, "learning_rate": 0.0008858431589997596, "loss": 2.5845, "step": 497 }, { "epoch": 2.4428044280442807, "grad_norm": 0.23668222680508744, "learning_rate": 0.0008853376009200347, "loss": 3.6546, "step": 498 }, { "epoch": 2.4477244772447726, "grad_norm": 0.31263200768647026, "learning_rate": 0.0008848310707906138, "loss": 3.5673, "step": 499 }, { "epoch": 2.4526445264452645, "grad_norm": 0.5443651106055652, "learning_rate": 0.000884323569889266, "loss": 3.4131, "step": 500 }, { "epoch": 2.4575645756457565, "grad_norm": 0.2557265616141521, "learning_rate": 0.0008838150994962093, "loss": 3.5581, "step": 501 }, { "epoch": 2.4624846248462484, "grad_norm": 0.454101179413854, "learning_rate": 0.0008833056608941073, "loss": 2.5321, "step": 502 }, { "epoch": 2.4674046740467404, "grad_norm": 0.37262232296799547, "learning_rate": 0.0008827952553680656, "loss": 3.6133, "step": 503 }, { "epoch": 2.4723247232472323, "grad_norm": 0.6036923693936145, "learning_rate": 0.0008822838842056296, "loss": 3.446, "step": 504 }, { "epoch": 2.4772447724477242, "grad_norm": 0.5456474107186624, "learning_rate": 0.0008817715486967802, "loss": 3.2691, "step": 505 }, { "epoch": 2.4821648216482166, "grad_norm": 0.48970213661367795, "learning_rate": 0.000881258250133931, "loss": 3.5089, "step": 506 }, { "epoch": 2.4870848708487086, "grad_norm": 0.34942167599420426, "learning_rate": 0.0008807439898119252, "loss": 2.9334, "step": 507 }, { "epoch": 2.4920049200492005, "grad_norm": 0.261339350655468, "learning_rate": 0.0008802287690280318, "loss": 3.2115, "step": 508 }, { "epoch": 2.4969249692496924, "grad_norm": 0.3411542441316105, "learning_rate": 0.0008797125890819429, "loss": 3.3798, "step": 509 }, { "epoch": 2.5018450184501844, "grad_norm": 0.36955155828864694, "learning_rate": 0.00087919545127577, "loss": 3.5252, "step": 510 }, { "epoch": 2.5067650676506767, "grad_norm": 0.3801722350165069, "learning_rate": 0.0008786773569140413, "loss": 2.5998, "step": 511 }, { "epoch": 2.5116851168511687, "grad_norm": 0.33177582464378025, "learning_rate": 0.0008781583073036973, "loss": 3.5139, "step": 512 }, { "epoch": 2.5166051660516606, "grad_norm": 0.47116747721442, "learning_rate": 0.0008776383037540888, "loss": 3.4299, "step": 513 }, { "epoch": 2.5215252152521526, "grad_norm": 0.3243956466835861, "learning_rate": 0.0008771173475769727, "loss": 2.9008, "step": 514 }, { "epoch": 2.5264452644526445, "grad_norm": 0.23453967301344925, "learning_rate": 0.0008765954400865092, "loss": 2.8954, "step": 515 }, { "epoch": 2.5313653136531364, "grad_norm": 0.2982622501208373, "learning_rate": 0.000876072582599258, "loss": 3.4442, "step": 516 }, { "epoch": 2.5362853628536284, "grad_norm": 0.25501989401450953, "learning_rate": 0.0008755487764341756, "loss": 2.8853, "step": 517 }, { "epoch": 2.5412054120541203, "grad_norm": 0.23530885998016376, "learning_rate": 0.0008750240229126113, "loss": 3.4507, "step": 518 }, { "epoch": 2.5461254612546127, "grad_norm": 0.4509565818675211, "learning_rate": 0.0008744983233583044, "loss": 3.1532, "step": 519 }, { "epoch": 2.5510455104551046, "grad_norm": 0.21067151280102675, "learning_rate": 0.0008739716790973806, "loss": 3.6416, "step": 520 }, { "epoch": 2.5559655596555966, "grad_norm": 0.5861474468561233, "learning_rate": 0.0008734440914583486, "loss": 3.2536, "step": 521 }, { "epoch": 2.5608856088560885, "grad_norm": 0.2203889494813628, "learning_rate": 0.0008729155617720971, "loss": 3.7557, "step": 522 }, { "epoch": 2.5658056580565805, "grad_norm": 0.28263147589887444, "learning_rate": 0.0008723860913718909, "loss": 3.6516, "step": 523 }, { "epoch": 2.570725707257073, "grad_norm": 0.34399692242828495, "learning_rate": 0.0008718556815933683, "loss": 2.8382, "step": 524 }, { "epoch": 2.5756457564575648, "grad_norm": 0.5772447908626964, "learning_rate": 0.0008713243337745365, "loss": 3.5155, "step": 525 }, { "epoch": 2.5805658056580567, "grad_norm": 0.38057446150502355, "learning_rate": 0.0008707920492557697, "loss": 3.272, "step": 526 }, { "epoch": 2.5854858548585486, "grad_norm": 0.3309880891835743, "learning_rate": 0.000870258829379805, "loss": 3.1941, "step": 527 }, { "epoch": 2.5904059040590406, "grad_norm": 0.2515232042303292, "learning_rate": 0.0008697246754917385, "loss": 3.5974, "step": 528 }, { "epoch": 2.5953259532595325, "grad_norm": 0.3064825186601961, "learning_rate": 0.0008691895889390227, "loss": 3.3187, "step": 529 }, { "epoch": 2.6002460024600245, "grad_norm": 0.24532300537923452, "learning_rate": 0.0008686535710714631, "loss": 3.4014, "step": 530 }, { "epoch": 2.6051660516605164, "grad_norm": 0.2405531355186871, "learning_rate": 0.0008681166232412141, "loss": 3.7837, "step": 531 }, { "epoch": 2.6100861008610083, "grad_norm": 0.24980579588588908, "learning_rate": 0.0008675787468027762, "loss": 3.8081, "step": 532 }, { "epoch": 2.6150061500615007, "grad_norm": 0.31408879503188586, "learning_rate": 0.0008670399431129925, "loss": 3.1509, "step": 533 }, { "epoch": 2.6199261992619927, "grad_norm": 0.28132775230209117, "learning_rate": 0.0008665002135310451, "loss": 3.5714, "step": 534 }, { "epoch": 2.6248462484624846, "grad_norm": 1.209181208474014, "learning_rate": 0.0008659595594184516, "loss": 3.6863, "step": 535 }, { "epoch": 2.6297662976629765, "grad_norm": 0.2290602867464935, "learning_rate": 0.0008654179821390621, "loss": 3.4864, "step": 536 }, { "epoch": 2.6346863468634685, "grad_norm": 0.4232790246881263, "learning_rate": 0.0008648754830590552, "loss": 3.3434, "step": 537 }, { "epoch": 2.639606396063961, "grad_norm": 0.3186299509733485, "learning_rate": 0.000864332063546935, "loss": 3.7189, "step": 538 }, { "epoch": 2.644526445264453, "grad_norm": 0.25168558651643325, "learning_rate": 0.0008637877249735273, "loss": 3.45, "step": 539 }, { "epoch": 2.6494464944649447, "grad_norm": 0.274106424167992, "learning_rate": 0.0008632424687119767, "loss": 3.1486, "step": 540 }, { "epoch": 2.6543665436654367, "grad_norm": 0.6021148781344842, "learning_rate": 0.0008626962961377423, "loss": 3.5319, "step": 541 }, { "epoch": 2.6592865928659286, "grad_norm": 0.2403818627662489, "learning_rate": 0.000862149208628595, "loss": 2.8376, "step": 542 }, { "epoch": 2.6642066420664205, "grad_norm": 0.4157571304417629, "learning_rate": 0.0008616012075646133, "loss": 3.1362, "step": 543 }, { "epoch": 2.6691266912669125, "grad_norm": 0.17790372855359163, "learning_rate": 0.0008610522943281808, "loss": 3.3051, "step": 544 }, { "epoch": 2.6740467404674044, "grad_norm": 0.358425076053947, "learning_rate": 0.0008605024703039817, "loss": 2.4171, "step": 545 }, { "epoch": 2.678966789667897, "grad_norm": 0.3147891326612052, "learning_rate": 0.000859951736878998, "loss": 3.219, "step": 546 }, { "epoch": 2.6838868388683887, "grad_norm": 0.25409927848669495, "learning_rate": 0.0008594000954425055, "loss": 3.0446, "step": 547 }, { "epoch": 2.6888068880688807, "grad_norm": 0.5599148838047568, "learning_rate": 0.0008588475473860709, "loss": 3.71, "step": 548 }, { "epoch": 2.6937269372693726, "grad_norm": 0.3916984757152536, "learning_rate": 0.0008582940941035476, "loss": 3.596, "step": 549 }, { "epoch": 2.6986469864698646, "grad_norm": 0.36545126176667847, "learning_rate": 0.0008577397369910723, "loss": 3.3018, "step": 550 }, { "epoch": 2.703567035670357, "grad_norm": 0.508604926230333, "learning_rate": 0.0008571844774470627, "loss": 3.5051, "step": 551 }, { "epoch": 2.708487084870849, "grad_norm": 0.4937201864383132, "learning_rate": 0.0008566283168722117, "loss": 2.8695, "step": 552 }, { "epoch": 2.713407134071341, "grad_norm": 0.27733557695661853, "learning_rate": 0.0008560712566694861, "loss": 3.275, "step": 553 }, { "epoch": 2.7183271832718328, "grad_norm": 0.2615670597918371, "learning_rate": 0.0008555132982441216, "loss": 3.2578, "step": 554 }, { "epoch": 2.7232472324723247, "grad_norm": 0.4913593426710281, "learning_rate": 0.0008549544430036198, "loss": 3.0128, "step": 555 }, { "epoch": 2.7281672816728166, "grad_norm": 0.1770171319204203, "learning_rate": 0.0008543946923577448, "loss": 3.1222, "step": 556 }, { "epoch": 2.7330873308733086, "grad_norm": 0.2028596416704059, "learning_rate": 0.0008538340477185192, "loss": 3.3536, "step": 557 }, { "epoch": 2.7380073800738005, "grad_norm": 0.31805827498360056, "learning_rate": 0.0008532725105002211, "loss": 3.2489, "step": 558 }, { "epoch": 2.742927429274293, "grad_norm": 0.2893983685144155, "learning_rate": 0.0008527100821193797, "loss": 2.9376, "step": 559 }, { "epoch": 2.747847478474785, "grad_norm": 0.2784987310620702, "learning_rate": 0.0008521467639947725, "loss": 3.4043, "step": 560 }, { "epoch": 2.7527675276752768, "grad_norm": 0.4300678254880063, "learning_rate": 0.0008515825575474221, "loss": 3.4357, "step": 561 }, { "epoch": 2.7576875768757687, "grad_norm": 0.709095362374269, "learning_rate": 0.0008510174642005908, "loss": 3.7041, "step": 562 }, { "epoch": 2.7626076260762606, "grad_norm": 0.2947626818919257, "learning_rate": 0.0008504514853797789, "loss": 3.3877, "step": 563 }, { "epoch": 2.767527675276753, "grad_norm": 0.19036937829228529, "learning_rate": 0.0008498846225127204, "loss": 3.2371, "step": 564 }, { "epoch": 2.772447724477245, "grad_norm": 0.6027518704448109, "learning_rate": 0.0008493168770293792, "loss": 2.8922, "step": 565 }, { "epoch": 2.777367773677737, "grad_norm": 0.2508070772451908, "learning_rate": 0.0008487482503619457, "loss": 3.1827, "step": 566 }, { "epoch": 2.782287822878229, "grad_norm": 0.2465472789361326, "learning_rate": 0.0008481787439448331, "loss": 3.1101, "step": 567 }, { "epoch": 2.787207872078721, "grad_norm": 0.3665421791304747, "learning_rate": 0.0008476083592146742, "loss": 3.6415, "step": 568 }, { "epoch": 2.7921279212792127, "grad_norm": 0.252453591544854, "learning_rate": 0.000847037097610317, "loss": 2.2868, "step": 569 }, { "epoch": 2.7970479704797047, "grad_norm": 0.26337135335532563, "learning_rate": 0.0008464649605728216, "loss": 3.5153, "step": 570 }, { "epoch": 2.8019680196801966, "grad_norm": 0.18813366542571497, "learning_rate": 0.0008458919495454567, "loss": 3.303, "step": 571 }, { "epoch": 2.8068880688806885, "grad_norm": 0.37648412433416484, "learning_rate": 0.0008453180659736954, "loss": 3.1171, "step": 572 }, { "epoch": 2.811808118081181, "grad_norm": 0.3137746621286986, "learning_rate": 0.0008447433113052123, "loss": 3.5273, "step": 573 }, { "epoch": 2.816728167281673, "grad_norm": 0.2955367758452444, "learning_rate": 0.0008441676869898789, "loss": 3.1837, "step": 574 }, { "epoch": 2.821648216482165, "grad_norm": 1.7217184072616605, "learning_rate": 0.0008435911944797605, "loss": 3.7555, "step": 575 }, { "epoch": 2.8265682656826567, "grad_norm": 0.3214979231477616, "learning_rate": 0.0008430138352291132, "loss": 3.556, "step": 576 }, { "epoch": 2.8314883148831487, "grad_norm": 0.25907815602013456, "learning_rate": 0.000842435610694379, "loss": 3.1328, "step": 577 }, { "epoch": 2.836408364083641, "grad_norm": 0.6039768287732341, "learning_rate": 0.0008418565223341821, "loss": 3.3616, "step": 578 }, { "epoch": 2.841328413284133, "grad_norm": 0.5415614473408078, "learning_rate": 0.0008412765716093271, "loss": 3.5141, "step": 579 }, { "epoch": 2.846248462484625, "grad_norm": 0.19847167672398833, "learning_rate": 0.0008406957599827929, "loss": 3.3241, "step": 580 }, { "epoch": 2.851168511685117, "grad_norm": 0.42280714325900504, "learning_rate": 0.0008401140889197305, "loss": 3.2316, "step": 581 }, { "epoch": 2.856088560885609, "grad_norm": 0.31993519526219644, "learning_rate": 0.0008395315598874591, "loss": 3.6124, "step": 582 }, { "epoch": 2.8610086100861007, "grad_norm": 0.23237122298851928, "learning_rate": 0.0008389481743554619, "loss": 3.1363, "step": 583 }, { "epoch": 2.8659286592865927, "grad_norm": 0.292812376682415, "learning_rate": 0.0008383639337953827, "loss": 2.3609, "step": 584 }, { "epoch": 2.8708487084870846, "grad_norm": 0.1728851614913819, "learning_rate": 0.0008377788396810223, "loss": 3.1067, "step": 585 }, { "epoch": 2.875768757687577, "grad_norm": 0.45069653414294575, "learning_rate": 0.0008371928934883348, "loss": 2.7594, "step": 586 }, { "epoch": 2.880688806888069, "grad_norm": 0.21391612052156136, "learning_rate": 0.0008366060966954234, "loss": 2.961, "step": 587 }, { "epoch": 2.885608856088561, "grad_norm": 0.4574940605101431, "learning_rate": 0.0008360184507825375, "loss": 3.1701, "step": 588 }, { "epoch": 2.890528905289053, "grad_norm": 0.48441507040196996, "learning_rate": 0.0008354299572320679, "loss": 3.258, "step": 589 }, { "epoch": 2.8954489544895448, "grad_norm": 0.5446545129178969, "learning_rate": 0.0008348406175285439, "loss": 3.3981, "step": 590 }, { "epoch": 2.900369003690037, "grad_norm": 0.22673441328672134, "learning_rate": 0.0008342504331586298, "loss": 2.9428, "step": 591 }, { "epoch": 2.905289052890529, "grad_norm": 0.42658398990645574, "learning_rate": 0.0008336594056111197, "loss": 3.2077, "step": 592 }, { "epoch": 2.910209102091021, "grad_norm": 0.3509947588446193, "learning_rate": 0.0008330675363769355, "loss": 3.405, "step": 593 }, { "epoch": 2.915129151291513, "grad_norm": 0.26281196000888585, "learning_rate": 0.0008324748269491221, "loss": 2.954, "step": 594 }, { "epoch": 2.920049200492005, "grad_norm": 0.44620697929468445, "learning_rate": 0.0008318812788228432, "loss": 3.4603, "step": 595 }, { "epoch": 2.924969249692497, "grad_norm": 0.30881770772159967, "learning_rate": 0.0008312868934953794, "loss": 3.6134, "step": 596 }, { "epoch": 2.9298892988929888, "grad_norm": 0.2765412044582804, "learning_rate": 0.0008306916724661224, "loss": 3.3111, "step": 597 }, { "epoch": 2.9348093480934807, "grad_norm": 0.3324307050770388, "learning_rate": 0.0008300956172365724, "loss": 3.2831, "step": 598 }, { "epoch": 2.939729397293973, "grad_norm": 0.20744346064209676, "learning_rate": 0.0008294987293103333, "loss": 3.1528, "step": 599 }, { "epoch": 2.944649446494465, "grad_norm": 0.3198228426452386, "learning_rate": 0.0008289010101931105, "loss": 2.7675, "step": 600 }, { "epoch": 2.949569495694957, "grad_norm": 0.29462852796334876, "learning_rate": 0.0008283024613927053, "loss": 2.877, "step": 601 }, { "epoch": 2.954489544895449, "grad_norm": 0.19484916186447043, "learning_rate": 0.0008277030844190127, "loss": 3.6574, "step": 602 }, { "epoch": 2.959409594095941, "grad_norm": 0.4911922190547767, "learning_rate": 0.0008271028807840164, "loss": 3.2822, "step": 603 }, { "epoch": 2.9643296432964332, "grad_norm": 0.15171130725247742, "learning_rate": 0.0008265018520017855, "loss": 3.4304, "step": 604 }, { "epoch": 2.969249692496925, "grad_norm": 0.22862813781202002, "learning_rate": 0.0008258999995884705, "loss": 3.475, "step": 605 }, { "epoch": 2.974169741697417, "grad_norm": 0.4096766585350489, "learning_rate": 0.0008252973250622999, "loss": 3.6092, "step": 606 }, { "epoch": 2.979089790897909, "grad_norm": 0.5545271209641128, "learning_rate": 0.0008246938299435758, "loss": 3.1455, "step": 607 }, { "epoch": 2.984009840098401, "grad_norm": 1.1429649153593875, "learning_rate": 0.0008240895157546702, "loss": 3.5361, "step": 608 }, { "epoch": 2.988929889298893, "grad_norm": 0.29190309723885594, "learning_rate": 0.0008234843840200217, "loss": 3.5149, "step": 609 }, { "epoch": 2.993849938499385, "grad_norm": 0.237461548151454, "learning_rate": 0.0008228784362661311, "loss": 3.2282, "step": 610 }, { "epoch": 2.998769987699877, "grad_norm": 0.5773112413943284, "learning_rate": 0.0008222716740215573, "loss": 3.3359, "step": 611 }, { "epoch": 3.0, "grad_norm": 0.5773112413943284, "learning_rate": 0.0008216640988169141, "loss": 0.6329, "step": 612 }, { "epoch": 3.004920049200492, "grad_norm": 0.37788835940876947, "learning_rate": 0.0008210557121848663, "loss": 3.7655, "step": 613 }, { "epoch": 3.009840098400984, "grad_norm": 0.4791252934343718, "learning_rate": 0.0008204465156601253, "loss": 3.1437, "step": 614 }, { "epoch": 3.014760147601476, "grad_norm": 0.2121186773856055, "learning_rate": 0.0008198365107794457, "loss": 2.9967, "step": 615 }, { "epoch": 3.019680196801968, "grad_norm": 0.4430179358748, "learning_rate": 0.000819225699081621, "loss": 3.1978, "step": 616 }, { "epoch": 3.02460024600246, "grad_norm": 0.38653434570114076, "learning_rate": 0.0008186140821074802, "loss": 3.2753, "step": 617 }, { "epoch": 3.029520295202952, "grad_norm": 0.5347618507193047, "learning_rate": 0.0008180016613998836, "loss": 3.2395, "step": 618 }, { "epoch": 3.034440344403444, "grad_norm": 0.3850480946964441, "learning_rate": 0.0008173884385037192, "loss": 3.2884, "step": 619 }, { "epoch": 3.039360393603936, "grad_norm": 0.2296815323913668, "learning_rate": 0.0008167744149658979, "loss": 2.9691, "step": 620 }, { "epoch": 3.044280442804428, "grad_norm": 0.4436576774507372, "learning_rate": 0.0008161595923353516, "loss": 3.8212, "step": 621 }, { "epoch": 3.0492004920049203, "grad_norm": 0.3203657135226616, "learning_rate": 0.0008155439721630265, "loss": 3.4278, "step": 622 }, { "epoch": 3.054120541205412, "grad_norm": 0.30293259945695367, "learning_rate": 0.0008149275560018815, "loss": 3.5976, "step": 623 }, { "epoch": 3.059040590405904, "grad_norm": 0.41625720893606305, "learning_rate": 0.0008143103454068834, "loss": 3.2448, "step": 624 }, { "epoch": 3.063960639606396, "grad_norm": 0.2809279008876309, "learning_rate": 0.0008136923419350031, "loss": 3.6232, "step": 625 }, { "epoch": 3.068880688806888, "grad_norm": 0.29429479628490013, "learning_rate": 0.0008130735471452111, "loss": 2.9427, "step": 626 }, { "epoch": 3.07380073800738, "grad_norm": 0.17859309765140946, "learning_rate": 0.0008124539625984749, "loss": 3.2219, "step": 627 }, { "epoch": 3.078720787207872, "grad_norm": 0.3381076178958564, "learning_rate": 0.0008118335898577531, "loss": 3.4821, "step": 628 }, { "epoch": 3.0836408364083643, "grad_norm": 0.41230165817524345, "learning_rate": 0.0008112124304879937, "loss": 3.261, "step": 629 }, { "epoch": 3.088560885608856, "grad_norm": 0.1824189052360442, "learning_rate": 0.0008105904860561286, "loss": 3.043, "step": 630 }, { "epoch": 3.093480934809348, "grad_norm": 0.1977886602706406, "learning_rate": 0.00080996775813107, "loss": 3.4017, "step": 631 }, { "epoch": 3.09840098400984, "grad_norm": 0.22679831325906893, "learning_rate": 0.0008093442482837066, "loss": 3.583, "step": 632 }, { "epoch": 3.103321033210332, "grad_norm": 0.33396899694426824, "learning_rate": 0.0008087199580868996, "loss": 2.9858, "step": 633 }, { "epoch": 3.108241082410824, "grad_norm": 0.2965723946988688, "learning_rate": 0.0008080948891154787, "loss": 3.3739, "step": 634 }, { "epoch": 3.113161131611316, "grad_norm": 0.2158857137663753, "learning_rate": 0.000807469042946238, "loss": 2.785, "step": 635 }, { "epoch": 3.1180811808118083, "grad_norm": 0.19588960828697052, "learning_rate": 0.0008068424211579325, "loss": 3.2533, "step": 636 }, { "epoch": 3.1230012300123002, "grad_norm": 0.21126069452609975, "learning_rate": 0.0008062150253312734, "loss": 3.6138, "step": 637 }, { "epoch": 3.127921279212792, "grad_norm": 0.23978057052718416, "learning_rate": 0.0008055868570489246, "loss": 2.5477, "step": 638 }, { "epoch": 3.132841328413284, "grad_norm": 0.1913388239278725, "learning_rate": 0.0008049579178954989, "loss": 2.8084, "step": 639 }, { "epoch": 3.137761377613776, "grad_norm": 0.43288003470590264, "learning_rate": 0.0008043282094575531, "loss": 3.5483, "step": 640 }, { "epoch": 3.142681426814268, "grad_norm": 0.22069963039425997, "learning_rate": 0.0008036977333235848, "loss": 2.9895, "step": 641 }, { "epoch": 3.14760147601476, "grad_norm": 0.310399423410362, "learning_rate": 0.0008030664910840289, "loss": 3.3594, "step": 642 }, { "epoch": 3.1525215252152523, "grad_norm": 0.35945038233580456, "learning_rate": 0.0008024344843312516, "loss": 3.5627, "step": 643 }, { "epoch": 3.1574415744157442, "grad_norm": 0.4574989581018336, "learning_rate": 0.0008018017146595489, "loss": 3.2836, "step": 644 }, { "epoch": 3.162361623616236, "grad_norm": 0.23998724925160553, "learning_rate": 0.0008011681836651401, "loss": 2.7443, "step": 645 }, { "epoch": 3.167281672816728, "grad_norm": 0.8252046319827724, "learning_rate": 0.0008005338929461663, "loss": 3.5864, "step": 646 }, { "epoch": 3.17220172201722, "grad_norm": 0.29300991017650446, "learning_rate": 0.0007998988441026839, "loss": 3.1275, "step": 647 }, { "epoch": 3.177121771217712, "grad_norm": 0.257252742337122, "learning_rate": 0.0007992630387366626, "loss": 3.4553, "step": 648 }, { "epoch": 3.1820418204182044, "grad_norm": 1.090529029396679, "learning_rate": 0.0007986264784519802, "loss": 3.1688, "step": 649 }, { "epoch": 3.1869618696186963, "grad_norm": 0.24692198022868125, "learning_rate": 0.0007979891648544185, "loss": 3.1989, "step": 650 }, { "epoch": 3.1918819188191883, "grad_norm": 0.3038891771883618, "learning_rate": 0.0007973510995516602, "loss": 3.6399, "step": 651 }, { "epoch": 3.19680196801968, "grad_norm": 0.3712604855705859, "learning_rate": 0.000796712284153284, "loss": 3.2941, "step": 652 }, { "epoch": 3.201722017220172, "grad_norm": 0.26239672319471874, "learning_rate": 0.0007960727202707605, "loss": 2.7806, "step": 653 }, { "epoch": 3.206642066420664, "grad_norm": 0.28107610334760075, "learning_rate": 0.000795432409517449, "loss": 3.4021, "step": 654 }, { "epoch": 3.211562115621156, "grad_norm": 0.2638887475673738, "learning_rate": 0.0007947913535085924, "loss": 3.2748, "step": 655 }, { "epoch": 3.2164821648216484, "grad_norm": 0.19624923518235268, "learning_rate": 0.0007941495538613134, "loss": 3.4803, "step": 656 }, { "epoch": 3.2214022140221403, "grad_norm": 0.31104553043655775, "learning_rate": 0.0007935070121946115, "loss": 3.0054, "step": 657 }, { "epoch": 3.2263222632226323, "grad_norm": 0.3106607992981457, "learning_rate": 0.0007928637301293568, "loss": 3.3483, "step": 658 }, { "epoch": 3.231242312423124, "grad_norm": 0.1771211245990364, "learning_rate": 0.0007922197092882881, "loss": 3.6408, "step": 659 }, { "epoch": 3.236162361623616, "grad_norm": 0.2865326712090847, "learning_rate": 0.0007915749512960075, "loss": 3.4306, "step": 660 }, { "epoch": 3.241082410824108, "grad_norm": 0.5164910964763684, "learning_rate": 0.0007909294577789765, "loss": 3.0928, "step": 661 }, { "epoch": 3.2460024600246005, "grad_norm": 0.25732182219565164, "learning_rate": 0.0007902832303655118, "loss": 2.6733, "step": 662 }, { "epoch": 3.2509225092250924, "grad_norm": 0.2442340313862587, "learning_rate": 0.0007896362706857825, "loss": 3.324, "step": 663 }, { "epoch": 3.2558425584255843, "grad_norm": 0.29449589536109544, "learning_rate": 0.0007889885803718036, "loss": 2.9751, "step": 664 }, { "epoch": 3.2607626076260763, "grad_norm": 0.24776099377422725, "learning_rate": 0.0007883401610574337, "loss": 3.1757, "step": 665 }, { "epoch": 3.265682656826568, "grad_norm": 0.22631558942664745, "learning_rate": 0.0007876910143783705, "loss": 2.6641, "step": 666 }, { "epoch": 3.27060270602706, "grad_norm": 0.2718838445446432, "learning_rate": 0.0007870411419721469, "loss": 2.9341, "step": 667 }, { "epoch": 3.275522755227552, "grad_norm": 0.4057292883808594, "learning_rate": 0.0007863905454781254, "loss": 3.4176, "step": 668 }, { "epoch": 3.280442804428044, "grad_norm": 0.3346939636546727, "learning_rate": 0.0007857392265374963, "loss": 3.3237, "step": 669 }, { "epoch": 3.2853628536285364, "grad_norm": 0.2330391877959295, "learning_rate": 0.0007850871867932716, "loss": 3.1194, "step": 670 }, { "epoch": 3.2902829028290284, "grad_norm": 0.26000491455221275, "learning_rate": 0.0007844344278902816, "loss": 3.6461, "step": 671 }, { "epoch": 3.2952029520295203, "grad_norm": 0.19648378047599352, "learning_rate": 0.0007837809514751713, "loss": 2.9315, "step": 672 }, { "epoch": 3.3001230012300122, "grad_norm": 0.28522774607553353, "learning_rate": 0.000783126759196395, "loss": 3.1767, "step": 673 }, { "epoch": 3.305043050430504, "grad_norm": 0.21856803920553167, "learning_rate": 0.0007824718527042134, "loss": 3.4546, "step": 674 }, { "epoch": 3.3099630996309966, "grad_norm": 0.24821788389992874, "learning_rate": 0.0007818162336506885, "loss": 3.6346, "step": 675 }, { "epoch": 3.3148831488314885, "grad_norm": 0.2446982271118237, "learning_rate": 0.00078115990368968, "loss": 3.6471, "step": 676 }, { "epoch": 3.3198031980319804, "grad_norm": 0.2720838886859589, "learning_rate": 0.0007805028644768406, "loss": 3.3082, "step": 677 }, { "epoch": 3.3247232472324724, "grad_norm": 0.30723243334700584, "learning_rate": 0.0007798451176696128, "loss": 3.1099, "step": 678 }, { "epoch": 3.3296432964329643, "grad_norm": 0.2686864776197463, "learning_rate": 0.0007791866649272236, "loss": 3.1548, "step": 679 }, { "epoch": 3.3345633456334562, "grad_norm": 0.1882739044874197, "learning_rate": 0.0007785275079106806, "loss": 3.1286, "step": 680 }, { "epoch": 3.339483394833948, "grad_norm": 0.3448110676623945, "learning_rate": 0.0007778676482827685, "loss": 3.2508, "step": 681 }, { "epoch": 3.34440344403444, "grad_norm": 0.21815308046492157, "learning_rate": 0.0007772070877080443, "loss": 3.5367, "step": 682 }, { "epoch": 3.3493234932349325, "grad_norm": 0.18538905364641078, "learning_rate": 0.0007765458278528325, "loss": 3.4405, "step": 683 }, { "epoch": 3.3542435424354244, "grad_norm": 0.35076647606151723, "learning_rate": 0.000775883870385223, "loss": 2.7709, "step": 684 }, { "epoch": 3.3591635916359164, "grad_norm": 0.4231436175724309, "learning_rate": 0.0007752212169750641, "loss": 3.0174, "step": 685 }, { "epoch": 3.3640836408364083, "grad_norm": 0.25804202091415923, "learning_rate": 0.0007745578692939602, "loss": 3.0929, "step": 686 }, { "epoch": 3.3690036900369003, "grad_norm": 0.23429379156511998, "learning_rate": 0.0007738938290152673, "loss": 3.251, "step": 687 }, { "epoch": 3.373923739237392, "grad_norm": 0.21651554618112834, "learning_rate": 0.0007732290978140885, "loss": 3.2467, "step": 688 }, { "epoch": 3.3788437884378846, "grad_norm": 0.3323541599369112, "learning_rate": 0.0007725636773672692, "loss": 3.6327, "step": 689 }, { "epoch": 3.3837638376383765, "grad_norm": 2.292994989353855, "learning_rate": 0.0007718975693533945, "loss": 3.7434, "step": 690 }, { "epoch": 3.3886838868388685, "grad_norm": 0.3409333097237527, "learning_rate": 0.0007712307754527832, "loss": 3.2932, "step": 691 }, { "epoch": 3.3936039360393604, "grad_norm": 0.3109868889064766, "learning_rate": 0.0007705632973474842, "loss": 3.4763, "step": 692 }, { "epoch": 3.3985239852398523, "grad_norm": 0.3779361348161703, "learning_rate": 0.000769895136721273, "loss": 3.1593, "step": 693 }, { "epoch": 3.4034440344403443, "grad_norm": 0.41765056210194546, "learning_rate": 0.0007692262952596465, "loss": 3.4828, "step": 694 }, { "epoch": 3.408364083640836, "grad_norm": 0.3535140388450835, "learning_rate": 0.000768556774649819, "loss": 3.7298, "step": 695 }, { "epoch": 3.4132841328413286, "grad_norm": 0.3389762841537967, "learning_rate": 0.000767886576580718, "loss": 3.2357, "step": 696 }, { "epoch": 3.4182041820418205, "grad_norm": 0.3719511757531799, "learning_rate": 0.0007672157027429803, "loss": 3.477, "step": 697 }, { "epoch": 3.4231242312423125, "grad_norm": 0.31775622971290224, "learning_rate": 0.0007665441548289468, "loss": 3.4805, "step": 698 }, { "epoch": 3.4280442804428044, "grad_norm": 0.32210276147576766, "learning_rate": 0.0007658719345326595, "loss": 3.135, "step": 699 }, { "epoch": 3.4329643296432963, "grad_norm": 0.178185515814666, "learning_rate": 0.0007651990435498559, "loss": 3.5507, "step": 700 }, { "epoch": 3.4378843788437883, "grad_norm": 0.24124458951994707, "learning_rate": 0.0007645254835779657, "loss": 3.6942, "step": 701 }, { "epoch": 3.4428044280442807, "grad_norm": 0.7614236147182618, "learning_rate": 0.0007638512563161064, "loss": 3.3024, "step": 702 }, { "epoch": 3.4477244772447726, "grad_norm": 0.3476415341947312, "learning_rate": 0.0007631763634650783, "loss": 2.9729, "step": 703 }, { "epoch": 3.4526445264452645, "grad_norm": 0.5353692829822977, "learning_rate": 0.0007625008067273611, "loss": 3.3316, "step": 704 }, { "epoch": 3.4575645756457565, "grad_norm": 0.24276643600465514, "learning_rate": 0.000761824587807109, "loss": 3.434, "step": 705 }, { "epoch": 3.4624846248462484, "grad_norm": 0.2718756955723028, "learning_rate": 0.0007611477084101468, "loss": 3.4187, "step": 706 }, { "epoch": 3.4674046740467404, "grad_norm": 0.21227199685981166, "learning_rate": 0.0007604701702439652, "loss": 3.5193, "step": 707 }, { "epoch": 3.4723247232472323, "grad_norm": 0.2578919405165125, "learning_rate": 0.0007597919750177169, "loss": 3.3485, "step": 708 }, { "epoch": 3.4772447724477242, "grad_norm": 0.23847797257765632, "learning_rate": 0.0007591131244422118, "loss": 2.8645, "step": 709 }, { "epoch": 3.4821648216482166, "grad_norm": 0.24531945818144346, "learning_rate": 0.0007584336202299136, "loss": 3.3301, "step": 710 }, { "epoch": 3.4870848708487086, "grad_norm": 0.2596369704154153, "learning_rate": 0.0007577534640949339, "loss": 3.4205, "step": 711 }, { "epoch": 3.4920049200492005, "grad_norm": 0.1993381294625847, "learning_rate": 0.00075707265775303, "loss": 3.0005, "step": 712 }, { "epoch": 3.4969249692496924, "grad_norm": 0.3559147583739751, "learning_rate": 0.0007563912029215983, "loss": 3.704, "step": 713 }, { "epoch": 3.5018450184501844, "grad_norm": 0.32466166058720597, "learning_rate": 0.0007557091013196716, "loss": 3.8044, "step": 714 }, { "epoch": 3.5067650676506767, "grad_norm": 0.29627131521362454, "learning_rate": 0.0007550263546679147, "loss": 3.3759, "step": 715 }, { "epoch": 3.5116851168511687, "grad_norm": 0.27404141266716264, "learning_rate": 0.0007543429646886185, "loss": 2.868, "step": 716 }, { "epoch": 3.5166051660516606, "grad_norm": 0.3329116377060321, "learning_rate": 0.0007536589331056976, "loss": 3.4846, "step": 717 }, { "epoch": 3.5215252152521526, "grad_norm": 0.27493791355232494, "learning_rate": 0.000752974261644685, "loss": 2.912, "step": 718 }, { "epoch": 3.5264452644526445, "grad_norm": 0.22393449568896998, "learning_rate": 0.0007522889520327274, "loss": 3.0547, "step": 719 }, { "epoch": 3.5313653136531364, "grad_norm": 0.19608160152466103, "learning_rate": 0.0007516030059985819, "loss": 3.0434, "step": 720 }, { "epoch": 3.5362853628536284, "grad_norm": 0.309427412137262, "learning_rate": 0.0007509164252726107, "loss": 3.4475, "step": 721 }, { "epoch": 3.5412054120541203, "grad_norm": 0.16584186218416921, "learning_rate": 0.0007502292115867767, "loss": 3.1881, "step": 722 }, { "epoch": 3.5461254612546127, "grad_norm": 0.16558732255379868, "learning_rate": 0.0007495413666746406, "loss": 3.6232, "step": 723 }, { "epoch": 3.5510455104551046, "grad_norm": 0.18105357137463562, "learning_rate": 0.0007488528922713541, "loss": 2.9271, "step": 724 }, { "epoch": 3.5559655596555966, "grad_norm": 0.3604378028558623, "learning_rate": 0.0007481637901136578, "loss": 3.7366, "step": 725 }, { "epoch": 3.5608856088560885, "grad_norm": 0.2405249422095639, "learning_rate": 0.0007474740619398756, "loss": 3.4604, "step": 726 }, { "epoch": 3.5658056580565805, "grad_norm": 0.19782935735075896, "learning_rate": 0.0007467837094899104, "loss": 3.3189, "step": 727 }, { "epoch": 3.570725707257073, "grad_norm": 0.20146426686686286, "learning_rate": 0.00074609273450524, "loss": 3.4147, "step": 728 }, { "epoch": 3.5756457564575648, "grad_norm": 0.28421054114001476, "learning_rate": 0.0007454011387289127, "loss": 3.5173, "step": 729 }, { "epoch": 3.5805658056580567, "grad_norm": 0.335252120796797, "learning_rate": 0.0007447089239055427, "loss": 3.5375, "step": 730 }, { "epoch": 3.5854858548585486, "grad_norm": 0.7080306338918538, "learning_rate": 0.0007440160917813059, "loss": 3.196, "step": 731 }, { "epoch": 3.5904059040590406, "grad_norm": 0.19181204743021785, "learning_rate": 0.0007433226441039355, "loss": 3.3924, "step": 732 }, { "epoch": 3.5953259532595325, "grad_norm": 0.35229527576778313, "learning_rate": 0.0007426285826227171, "loss": 3.3397, "step": 733 }, { "epoch": 3.6002460024600245, "grad_norm": 0.39774302937930667, "learning_rate": 0.0007419339090884847, "loss": 2.7688, "step": 734 }, { "epoch": 3.6051660516605164, "grad_norm": 0.21945926510423397, "learning_rate": 0.0007412386252536168, "loss": 3.0923, "step": 735 }, { "epoch": 3.6100861008610083, "grad_norm": 0.23866813043517318, "learning_rate": 0.0007405427328720311, "loss": 3.3214, "step": 736 }, { "epoch": 3.6150061500615007, "grad_norm": 0.20971429259769456, "learning_rate": 0.0007398462336991802, "loss": 3.108, "step": 737 }, { "epoch": 3.6199261992619927, "grad_norm": 0.4389391640816075, "learning_rate": 0.0007391491294920479, "loss": 3.5657, "step": 738 }, { "epoch": 3.6248462484624846, "grad_norm": 0.9877839149703718, "learning_rate": 0.0007384514220091437, "loss": 3.6144, "step": 739 }, { "epoch": 3.6297662976629765, "grad_norm": 0.3356613029657157, "learning_rate": 0.0007377531130104992, "loss": 3.1212, "step": 740 }, { "epoch": 3.6346863468634685, "grad_norm": 0.2468298165252035, "learning_rate": 0.0007370542042576634, "loss": 3.4087, "step": 741 }, { "epoch": 3.639606396063961, "grad_norm": 0.3201530220629794, "learning_rate": 0.0007363546975136983, "loss": 3.7264, "step": 742 }, { "epoch": 3.644526445264453, "grad_norm": 0.6289700953655759, "learning_rate": 0.0007356545945431743, "loss": 3.3303, "step": 743 }, { "epoch": 3.6494464944649447, "grad_norm": 0.4648722242583848, "learning_rate": 0.0007349538971121657, "loss": 3.2968, "step": 744 }, { "epoch": 3.6543665436654367, "grad_norm": 0.17941031095318985, "learning_rate": 0.0007342526069882465, "loss": 3.4214, "step": 745 }, { "epoch": 3.6592865928659286, "grad_norm": 0.3080074919918561, "learning_rate": 0.0007335507259404858, "loss": 3.1512, "step": 746 }, { "epoch": 3.6642066420664205, "grad_norm": 0.4792674828701538, "learning_rate": 0.0007328482557394434, "loss": 2.7976, "step": 747 }, { "epoch": 3.6691266912669125, "grad_norm": 0.5020623709256492, "learning_rate": 0.0007321451981571654, "loss": 3.5771, "step": 748 }, { "epoch": 3.6740467404674044, "grad_norm": 0.28299011678554475, "learning_rate": 0.0007314415549671794, "loss": 3.393, "step": 749 }, { "epoch": 3.678966789667897, "grad_norm": 0.5710899724191699, "learning_rate": 0.0007307373279444904, "loss": 3.125, "step": 750 }, { "epoch": 3.6838868388683887, "grad_norm": 0.3044389618309208, "learning_rate": 0.0007300325188655761, "loss": 3.3536, "step": 751 }, { "epoch": 3.6888068880688807, "grad_norm": 0.47476201658170564, "learning_rate": 0.0007293271295083823, "loss": 3.3606, "step": 752 }, { "epoch": 3.6937269372693726, "grad_norm": 0.7374790068061169, "learning_rate": 0.0007286211616523193, "loss": 3.1814, "step": 753 }, { "epoch": 3.6986469864698646, "grad_norm": 2.703447258283296, "learning_rate": 0.0007279146170782558, "loss": 3.1035, "step": 754 }, { "epoch": 3.703567035670357, "grad_norm": 0.4087781171862544, "learning_rate": 0.0007272074975685159, "loss": 2.8169, "step": 755 }, { "epoch": 3.708487084870849, "grad_norm": 0.5359231952256804, "learning_rate": 0.0007264998049068738, "loss": 3.3791, "step": 756 }, { "epoch": 3.713407134071341, "grad_norm": 0.4202176637907857, "learning_rate": 0.0007257915408785498, "loss": 2.8111, "step": 757 }, { "epoch": 3.7183271832718328, "grad_norm": 0.47544063511193096, "learning_rate": 0.0007250827072702054, "loss": 3.1517, "step": 758 }, { "epoch": 3.7232472324723247, "grad_norm": 0.5224793296704612, "learning_rate": 0.0007243733058699386, "loss": 3.268, "step": 759 }, { "epoch": 3.7281672816728166, "grad_norm": 0.420735443128869, "learning_rate": 0.00072366333846728, "loss": 3.4447, "step": 760 }, { "epoch": 3.7330873308733086, "grad_norm": 0.6770895564100234, "learning_rate": 0.0007229528068531881, "loss": 3.6965, "step": 761 }, { "epoch": 3.7380073800738005, "grad_norm": 0.2336107599823673, "learning_rate": 0.0007222417128200445, "loss": 2.7885, "step": 762 }, { "epoch": 3.742927429274293, "grad_norm": 0.2332246241301203, "learning_rate": 0.0007215300581616495, "loss": 3.8205, "step": 763 }, { "epoch": 3.747847478474785, "grad_norm": 0.4203289710097054, "learning_rate": 0.0007208178446732178, "loss": 3.235, "step": 764 }, { "epoch": 3.7527675276752768, "grad_norm": 0.21227055368565595, "learning_rate": 0.0007201050741513735, "loss": 3.738, "step": 765 }, { "epoch": 3.7576875768757687, "grad_norm": 0.16322029775865946, "learning_rate": 0.0007193917483941467, "loss": 3.3344, "step": 766 }, { "epoch": 3.7626076260762606, "grad_norm": 0.6551170414307655, "learning_rate": 0.0007186778692009668, "loss": 2.6873, "step": 767 }, { "epoch": 3.767527675276753, "grad_norm": 0.3467639103462003, "learning_rate": 0.0007179634383726603, "loss": 3.4703, "step": 768 }, { "epoch": 3.772447724477245, "grad_norm": 0.3415566410286678, "learning_rate": 0.0007172484577114451, "loss": 3.3301, "step": 769 }, { "epoch": 3.777367773677737, "grad_norm": 0.24494213275698942, "learning_rate": 0.0007165329290209259, "loss": 2.4782, "step": 770 }, { "epoch": 3.782287822878229, "grad_norm": 0.25728433434706055, "learning_rate": 0.0007158168541060899, "loss": 3.5319, "step": 771 }, { "epoch": 3.787207872078721, "grad_norm": 0.23446195311898277, "learning_rate": 0.000715100234773302, "loss": 3.6178, "step": 772 }, { "epoch": 3.7921279212792127, "grad_norm": 0.3348135102340776, "learning_rate": 0.000714383072830301, "loss": 3.4725, "step": 773 }, { "epoch": 3.7970479704797047, "grad_norm": 0.23856383289656874, "learning_rate": 0.000713665370086194, "loss": 3.7731, "step": 774 }, { "epoch": 3.8019680196801966, "grad_norm": 0.1849662597439374, "learning_rate": 0.0007129471283514525, "loss": 3.4777, "step": 775 }, { "epoch": 3.8068880688806885, "grad_norm": 0.266237743594453, "learning_rate": 0.0007122283494379076, "loss": 3.2428, "step": 776 }, { "epoch": 3.811808118081181, "grad_norm": 0.234698670034024, "learning_rate": 0.0007115090351587454, "loss": 3.2954, "step": 777 }, { "epoch": 3.816728167281673, "grad_norm": 0.31494090431048005, "learning_rate": 0.0007107891873285029, "loss": 3.315, "step": 778 }, { "epoch": 3.821648216482165, "grad_norm": 0.3784358048201398, "learning_rate": 0.0007100688077630628, "loss": 3.6732, "step": 779 }, { "epoch": 3.8265682656826567, "grad_norm": 0.37725624933360574, "learning_rate": 0.0007093478982796486, "loss": 3.5035, "step": 780 }, { "epoch": 3.8314883148831487, "grad_norm": 0.2153929507843546, "learning_rate": 0.0007086264606968215, "loss": 3.6734, "step": 781 }, { "epoch": 3.836408364083641, "grad_norm": 0.2880781865893035, "learning_rate": 0.0007079044968344743, "loss": 3.618, "step": 782 }, { "epoch": 3.841328413284133, "grad_norm": 0.26420051599197053, "learning_rate": 0.0007071820085138275, "loss": 2.9507, "step": 783 }, { "epoch": 3.846248462484625, "grad_norm": 0.27126496117670607, "learning_rate": 0.0007064589975574247, "loss": 3.3986, "step": 784 }, { "epoch": 3.851168511685117, "grad_norm": 0.3180284580794458, "learning_rate": 0.000705735465789128, "loss": 3.1718, "step": 785 }, { "epoch": 3.856088560885609, "grad_norm": 0.28085286177400715, "learning_rate": 0.0007050114150341126, "loss": 3.584, "step": 786 }, { "epoch": 3.8610086100861007, "grad_norm": 0.1924442989678574, "learning_rate": 0.0007042868471188641, "loss": 3.6442, "step": 787 }, { "epoch": 3.8659286592865927, "grad_norm": 0.30340239106991906, "learning_rate": 0.0007035617638711715, "loss": 3.4096, "step": 788 }, { "epoch": 3.8708487084870846, "grad_norm": 0.4284730137602197, "learning_rate": 0.0007028361671201245, "loss": 3.4999, "step": 789 }, { "epoch": 3.875768757687577, "grad_norm": 0.378219360428769, "learning_rate": 0.0007021100586961079, "loss": 3.8302, "step": 790 }, { "epoch": 3.880688806888069, "grad_norm": 0.23317881155754056, "learning_rate": 0.0007013834404307972, "loss": 3.5327, "step": 791 }, { "epoch": 3.885608856088561, "grad_norm": 0.1534170743338541, "learning_rate": 0.0007006563141571541, "loss": 2.9981, "step": 792 }, { "epoch": 3.890528905289053, "grad_norm": 0.23741014718331407, "learning_rate": 0.000699928681709422, "loss": 3.4866, "step": 793 }, { "epoch": 3.8954489544895448, "grad_norm": 0.18968911864259946, "learning_rate": 0.0006992005449231208, "loss": 3.0846, "step": 794 }, { "epoch": 3.900369003690037, "grad_norm": 0.17425216715854394, "learning_rate": 0.0006984719056350429, "loss": 3.3657, "step": 795 }, { "epoch": 3.905289052890529, "grad_norm": 0.34552754702053706, "learning_rate": 0.0006977427656832479, "loss": 3.2559, "step": 796 }, { "epoch": 3.910209102091021, "grad_norm": 0.41045987982190807, "learning_rate": 0.0006970131269070591, "loss": 3.4043, "step": 797 }, { "epoch": 3.915129151291513, "grad_norm": 0.4537091981074169, "learning_rate": 0.0006962829911470574, "loss": 3.555, "step": 798 }, { "epoch": 3.920049200492005, "grad_norm": 0.2552278155458706, "learning_rate": 0.0006955523602450779, "loss": 2.8018, "step": 799 }, { "epoch": 3.924969249692497, "grad_norm": 0.24444665882835004, "learning_rate": 0.0006948212360442048, "loss": 3.6893, "step": 800 }, { "epoch": 3.9298892988929888, "grad_norm": 0.33480070173381116, "learning_rate": 0.0006940896203887659, "loss": 3.4484, "step": 801 }, { "epoch": 3.9348093480934807, "grad_norm": 0.20965218353078027, "learning_rate": 0.0006933575151243294, "loss": 3.3865, "step": 802 }, { "epoch": 3.939729397293973, "grad_norm": 0.2319587418985397, "learning_rate": 0.0006926249220976987, "loss": 3.325, "step": 803 }, { "epoch": 3.944649446494465, "grad_norm": 0.3481585420095062, "learning_rate": 0.0006918918431569074, "loss": 3.1144, "step": 804 }, { "epoch": 3.949569495694957, "grad_norm": 0.2777188359966893, "learning_rate": 0.0006911582801512146, "loss": 2.6559, "step": 805 }, { "epoch": 3.954489544895449, "grad_norm": 0.29892415214937923, "learning_rate": 0.0006904242349311009, "loss": 2.7709, "step": 806 }, { "epoch": 3.959409594095941, "grad_norm": 0.1696935700672524, "learning_rate": 0.0006896897093482629, "loss": 2.9667, "step": 807 }, { "epoch": 3.9643296432964332, "grad_norm": 0.43500498031484497, "learning_rate": 0.0006889547052556097, "loss": 2.935, "step": 808 }, { "epoch": 3.969249692496925, "grad_norm": 0.1638136684275606, "learning_rate": 0.000688219224507257, "loss": 3.1474, "step": 809 }, { "epoch": 3.974169741697417, "grad_norm": 0.19104060075409496, "learning_rate": 0.0006874832689585225, "loss": 2.9755, "step": 810 }, { "epoch": 3.979089790897909, "grad_norm": 0.22764787513767895, "learning_rate": 0.0006867468404659221, "loss": 2.6591, "step": 811 }, { "epoch": 3.984009840098401, "grad_norm": 0.20655520512537037, "learning_rate": 0.0006860099408871652, "loss": 3.3368, "step": 812 }, { "epoch": 3.988929889298893, "grad_norm": 0.16332346985230897, "learning_rate": 0.0006852725720811486, "loss": 3.3711, "step": 813 }, { "epoch": 3.993849938499385, "grad_norm": 0.6790608622979326, "learning_rate": 0.0006845347359079536, "loss": 3.3939, "step": 814 }, { "epoch": 3.998769987699877, "grad_norm": 0.4246621640060049, "learning_rate": 0.0006837964342288398, "loss": 2.812, "step": 815 }, { "epoch": 4.0, "grad_norm": 0.43314869751336227, "learning_rate": 0.0006830576689062413, "loss": 0.8596, "step": 816 }, { "epoch": 4.004920049200492, "grad_norm": 0.24455854939832183, "learning_rate": 0.0006823184418037624, "loss": 3.6981, "step": 817 }, { "epoch": 4.009840098400984, "grad_norm": 0.24664050341220561, "learning_rate": 0.0006815787547861713, "loss": 3.0567, "step": 818 }, { "epoch": 4.014760147601476, "grad_norm": 0.3062388285410535, "learning_rate": 0.0006808386097193968, "loss": 2.8939, "step": 819 }, { "epoch": 4.019680196801968, "grad_norm": 0.2960699996294249, "learning_rate": 0.0006800980084705235, "loss": 3.3172, "step": 820 }, { "epoch": 4.02460024600246, "grad_norm": 0.2921558708483319, "learning_rate": 0.0006793569529077863, "loss": 2.6756, "step": 821 }, { "epoch": 4.029520295202952, "grad_norm": 0.22607169729693916, "learning_rate": 0.0006786154449005664, "loss": 3.504, "step": 822 }, { "epoch": 4.0344403444034445, "grad_norm": 0.20729384715334453, "learning_rate": 0.0006778734863193862, "loss": 3.359, "step": 823 }, { "epoch": 4.039360393603936, "grad_norm": 0.24360164850517163, "learning_rate": 0.0006771310790359046, "loss": 3.4836, "step": 824 }, { "epoch": 4.044280442804428, "grad_norm": 0.2946715667908104, "learning_rate": 0.000676388224922913, "loss": 3.1872, "step": 825 }, { "epoch": 4.04920049200492, "grad_norm": 0.24529932606570565, "learning_rate": 0.0006756449258543291, "loss": 2.9933, "step": 826 }, { "epoch": 4.054120541205412, "grad_norm": 0.2909283678687505, "learning_rate": 0.0006749011837051935, "loss": 3.2512, "step": 827 }, { "epoch": 4.059040590405904, "grad_norm": 0.23435724007118816, "learning_rate": 0.0006741570003516647, "loss": 3.4639, "step": 828 }, { "epoch": 4.063960639606396, "grad_norm": 0.28881314228175603, "learning_rate": 0.0006734123776710137, "loss": 2.893, "step": 829 }, { "epoch": 4.068880688806888, "grad_norm": 0.3445583872757909, "learning_rate": 0.0006726673175416203, "loss": 3.2338, "step": 830 }, { "epoch": 4.07380073800738, "grad_norm": 0.38735153756001317, "learning_rate": 0.0006719218218429673, "loss": 2.9207, "step": 831 }, { "epoch": 4.078720787207872, "grad_norm": 0.26832372563658635, "learning_rate": 0.0006711758924556364, "loss": 3.141, "step": 832 }, { "epoch": 4.083640836408364, "grad_norm": 0.2803052331751055, "learning_rate": 0.0006704295312613037, "loss": 3.73, "step": 833 }, { "epoch": 4.088560885608856, "grad_norm": 0.1681867932652505, "learning_rate": 0.0006696827401427339, "loss": 2.8774, "step": 834 }, { "epoch": 4.093480934809348, "grad_norm": 0.4928925768217112, "learning_rate": 0.0006689355209837769, "loss": 3.7569, "step": 835 }, { "epoch": 4.0984009840098405, "grad_norm": 0.33157144103611097, "learning_rate": 0.0006681878756693618, "loss": 3.7871, "step": 836 }, { "epoch": 4.1033210332103325, "grad_norm": 0.2677887588225307, "learning_rate": 0.0006674398060854931, "loss": 3.6537, "step": 837 }, { "epoch": 4.108241082410824, "grad_norm": 0.2936024746119496, "learning_rate": 0.0006666913141192454, "loss": 3.1944, "step": 838 }, { "epoch": 4.113161131611316, "grad_norm": 0.1843906079727091, "learning_rate": 0.000665942401658759, "loss": 3.1552, "step": 839 }, { "epoch": 4.118081180811808, "grad_norm": 0.1947366338155521, "learning_rate": 0.0006651930705932344, "loss": 3.8835, "step": 840 }, { "epoch": 4.1230012300123, "grad_norm": 0.22127997400391597, "learning_rate": 0.0006644433228129288, "loss": 2.9066, "step": 841 }, { "epoch": 4.127921279212792, "grad_norm": 0.25604823382000214, "learning_rate": 0.0006636931602091499, "loss": 3.6227, "step": 842 }, { "epoch": 4.132841328413284, "grad_norm": 0.29762937207148976, "learning_rate": 0.0006629425846742525, "loss": 3.522, "step": 843 }, { "epoch": 4.137761377613776, "grad_norm": 0.2230086237033379, "learning_rate": 0.0006621915981016327, "loss": 3.6817, "step": 844 }, { "epoch": 4.142681426814268, "grad_norm": 0.20837116908578554, "learning_rate": 0.0006614402023857231, "loss": 3.5172, "step": 845 }, { "epoch": 4.14760147601476, "grad_norm": 0.17326768334634388, "learning_rate": 0.0006606883994219893, "loss": 2.3122, "step": 846 }, { "epoch": 4.152521525215252, "grad_norm": 0.26321351273323096, "learning_rate": 0.0006599361911069234, "loss": 3.2891, "step": 847 }, { "epoch": 4.157441574415744, "grad_norm": 0.7397693303260023, "learning_rate": 0.0006591835793380407, "loss": 3.4149, "step": 848 }, { "epoch": 4.162361623616236, "grad_norm": 0.5004760246156575, "learning_rate": 0.0006584305660138735, "loss": 3.2092, "step": 849 }, { "epoch": 4.167281672816729, "grad_norm": 0.2238963307356851, "learning_rate": 0.0006576771530339676, "loss": 3.3327, "step": 850 }, { "epoch": 4.1722017220172205, "grad_norm": 0.5694839092776535, "learning_rate": 0.0006569233422988771, "loss": 3.2131, "step": 851 }, { "epoch": 4.177121771217712, "grad_norm": 0.25943827527162555, "learning_rate": 0.0006561691357101589, "loss": 3.5401, "step": 852 }, { "epoch": 4.182041820418204, "grad_norm": 0.3321369618763318, "learning_rate": 0.0006554145351703688, "loss": 3.1221, "step": 853 }, { "epoch": 4.186961869618696, "grad_norm": 0.4664233137246361, "learning_rate": 0.0006546595425830569, "loss": 2.8118, "step": 854 }, { "epoch": 4.191881918819188, "grad_norm": 0.5928396105398344, "learning_rate": 0.0006539041598527611, "loss": 2.5304, "step": 855 }, { "epoch": 4.19680196801968, "grad_norm": 0.23279688332161255, "learning_rate": 0.0006531483888850043, "loss": 3.1041, "step": 856 }, { "epoch": 4.201722017220172, "grad_norm": 0.492399116776326, "learning_rate": 0.0006523922315862887, "loss": 3.3636, "step": 857 }, { "epoch": 4.206642066420664, "grad_norm": 0.23770332971601957, "learning_rate": 0.0006516356898640908, "loss": 3.7663, "step": 858 }, { "epoch": 4.211562115621156, "grad_norm": 0.38505109604368404, "learning_rate": 0.0006508787656268573, "loss": 3.0644, "step": 859 }, { "epoch": 4.216482164821648, "grad_norm": 0.5141793411111297, "learning_rate": 0.0006501214607839992, "loss": 3.4336, "step": 860 }, { "epoch": 4.22140221402214, "grad_norm": 0.40335751426660055, "learning_rate": 0.0006493637772458879, "loss": 3.5918, "step": 861 }, { "epoch": 4.226322263222632, "grad_norm": 0.36597454838313204, "learning_rate": 0.0006486057169238503, "loss": 3.5016, "step": 862 }, { "epoch": 4.231242312423125, "grad_norm": 0.3141196684972445, "learning_rate": 0.0006478472817301635, "loss": 2.9344, "step": 863 }, { "epoch": 4.236162361623617, "grad_norm": 0.25729194789230675, "learning_rate": 0.0006470884735780505, "loss": 3.4059, "step": 864 }, { "epoch": 4.2410824108241085, "grad_norm": 0.26102297477951875, "learning_rate": 0.0006463292943816747, "loss": 3.4539, "step": 865 }, { "epoch": 4.2460024600246005, "grad_norm": 0.481216475323451, "learning_rate": 0.0006455697460561358, "loss": 2.8661, "step": 866 }, { "epoch": 4.250922509225092, "grad_norm": 0.26083324742253433, "learning_rate": 0.0006448098305174648, "loss": 3.5659, "step": 867 }, { "epoch": 4.255842558425584, "grad_norm": 0.2271127645438216, "learning_rate": 0.0006440495496826188, "loss": 3.7215, "step": 868 }, { "epoch": 4.260762607626076, "grad_norm": 0.29164170463878786, "learning_rate": 0.0006432889054694764, "loss": 3.3138, "step": 869 }, { "epoch": 4.265682656826568, "grad_norm": 0.2707488623170454, "learning_rate": 0.0006425278997968327, "loss": 3.5575, "step": 870 }, { "epoch": 4.27060270602706, "grad_norm": 0.20619997902600504, "learning_rate": 0.0006417665345843952, "loss": 3.3357, "step": 871 }, { "epoch": 4.275522755227552, "grad_norm": 0.2905137309278766, "learning_rate": 0.0006410048117527778, "loss": 3.5226, "step": 872 }, { "epoch": 4.280442804428044, "grad_norm": 0.2072254279609025, "learning_rate": 0.0006402427332234964, "loss": 3.7397, "step": 873 }, { "epoch": 4.285362853628536, "grad_norm": 0.3408728493818221, "learning_rate": 0.000639480300918965, "loss": 2.7117, "step": 874 }, { "epoch": 4.290282902829028, "grad_norm": 0.826526413226481, "learning_rate": 0.0006387175167624894, "loss": 3.1839, "step": 875 }, { "epoch": 4.29520295202952, "grad_norm": 0.20194415411474795, "learning_rate": 0.0006379543826782628, "loss": 3.4211, "step": 876 }, { "epoch": 4.300123001230013, "grad_norm": 0.23174692633675842, "learning_rate": 0.0006371909005913617, "loss": 3.1476, "step": 877 }, { "epoch": 4.305043050430505, "grad_norm": 0.10777009781409684, "learning_rate": 0.0006364270724277401, "loss": 3.4505, "step": 878 }, { "epoch": 4.3099630996309966, "grad_norm": 0.15357643796762876, "learning_rate": 0.0006356629001142251, "loss": 3.3905, "step": 879 }, { "epoch": 4.3148831488314885, "grad_norm": 0.2666776163491524, "learning_rate": 0.0006348983855785121, "loss": 3.6047, "step": 880 }, { "epoch": 4.31980319803198, "grad_norm": 0.21843258365141685, "learning_rate": 0.0006341335307491596, "loss": 3.4643, "step": 881 }, { "epoch": 4.324723247232472, "grad_norm": 0.24059279133325667, "learning_rate": 0.0006333683375555843, "loss": 3.348, "step": 882 }, { "epoch": 4.329643296432964, "grad_norm": 0.23837657201878779, "learning_rate": 0.000632602807928057, "loss": 3.3701, "step": 883 }, { "epoch": 4.334563345633456, "grad_norm": 0.2144903427201142, "learning_rate": 0.0006318369437976968, "loss": 2.9382, "step": 884 }, { "epoch": 4.339483394833948, "grad_norm": 0.26790467864675854, "learning_rate": 0.0006310707470964667, "loss": 3.0965, "step": 885 }, { "epoch": 4.34440344403444, "grad_norm": 0.2880867331422874, "learning_rate": 0.0006303042197571688, "loss": 3.0901, "step": 886 }, { "epoch": 4.349323493234932, "grad_norm": 0.18636822368582084, "learning_rate": 0.0006295373637134389, "loss": 3.3165, "step": 887 }, { "epoch": 4.354243542435424, "grad_norm": 0.3464919002872146, "learning_rate": 0.0006287701808997424, "loss": 3.4132, "step": 888 }, { "epoch": 4.359163591635916, "grad_norm": 0.36500310044588036, "learning_rate": 0.0006280026732513688, "loss": 3.1347, "step": 889 }, { "epoch": 4.364083640836409, "grad_norm": 0.33808234673748183, "learning_rate": 0.000627234842704427, "loss": 3.7883, "step": 890 }, { "epoch": 4.369003690036901, "grad_norm": 0.293437595139822, "learning_rate": 0.0006264666911958404, "loss": 2.6742, "step": 891 }, { "epoch": 4.373923739237393, "grad_norm": 0.2168214220913398, "learning_rate": 0.0006256982206633423, "loss": 3.1039, "step": 892 }, { "epoch": 4.378843788437885, "grad_norm": 0.16553651657433263, "learning_rate": 0.0006249294330454704, "loss": 3.2481, "step": 893 }, { "epoch": 4.3837638376383765, "grad_norm": 0.23102448405406115, "learning_rate": 0.0006241603302815629, "loss": 3.0445, "step": 894 }, { "epoch": 4.3886838868388685, "grad_norm": 0.21938016885127803, "learning_rate": 0.0006233909143117521, "loss": 3.5145, "step": 895 }, { "epoch": 4.39360393603936, "grad_norm": 0.1803490689822809, "learning_rate": 0.0006226211870769611, "loss": 3.0225, "step": 896 }, { "epoch": 4.398523985239852, "grad_norm": 0.22361264292319008, "learning_rate": 0.000621851150518898, "loss": 3.3788, "step": 897 }, { "epoch": 4.403444034440344, "grad_norm": 0.19766216659733254, "learning_rate": 0.000621080806580051, "loss": 2.6717, "step": 898 }, { "epoch": 4.408364083640836, "grad_norm": 0.2950636896663806, "learning_rate": 0.0006203101572036839, "loss": 3.8371, "step": 899 }, { "epoch": 4.413284132841328, "grad_norm": 0.25551429577300006, "learning_rate": 0.0006195392043338308, "loss": 3.0073, "step": 900 }, { "epoch": 4.41820418204182, "grad_norm": 0.2080327371536312, "learning_rate": 0.0006187679499152919, "loss": 3.4941, "step": 901 }, { "epoch": 4.423124231242312, "grad_norm": 0.21941974958579905, "learning_rate": 0.0006179963958936277, "loss": 3.2262, "step": 902 }, { "epoch": 4.428044280442805, "grad_norm": 0.3398235415134109, "learning_rate": 0.000617224544215154, "loss": 2.6439, "step": 903 }, { "epoch": 4.432964329643297, "grad_norm": 0.40811163559446956, "learning_rate": 0.0006164523968269385, "loss": 3.615, "step": 904 }, { "epoch": 4.437884378843789, "grad_norm": 0.31216718479320654, "learning_rate": 0.000615679955676794, "loss": 3.2553, "step": 905 }, { "epoch": 4.442804428044281, "grad_norm": 0.15654574664599868, "learning_rate": 0.0006149072227132748, "loss": 2.9969, "step": 906 }, { "epoch": 4.447724477244773, "grad_norm": 0.15882605888615095, "learning_rate": 0.0006141341998856711, "loss": 3.0801, "step": 907 }, { "epoch": 4.4526445264452645, "grad_norm": 0.30844678915036616, "learning_rate": 0.0006133608891440045, "loss": 3.4219, "step": 908 }, { "epoch": 4.4575645756457565, "grad_norm": 0.21080044150191468, "learning_rate": 0.0006125872924390226, "loss": 3.448, "step": 909 }, { "epoch": 4.462484624846248, "grad_norm": 0.11870330081600128, "learning_rate": 0.0006118134117221949, "loss": 3.6754, "step": 910 }, { "epoch": 4.46740467404674, "grad_norm": 0.42353732102079483, "learning_rate": 0.0006110392489457066, "loss": 3.0763, "step": 911 }, { "epoch": 4.472324723247232, "grad_norm": 0.2658673839747719, "learning_rate": 0.000610264806062455, "loss": 3.1532, "step": 912 }, { "epoch": 4.477244772447724, "grad_norm": 0.2919508668058704, "learning_rate": 0.0006094900850260439, "loss": 3.5337, "step": 913 }, { "epoch": 4.482164821648216, "grad_norm": 0.17330978967435964, "learning_rate": 0.0006087150877907786, "loss": 3.382, "step": 914 }, { "epoch": 4.487084870848708, "grad_norm": 0.2513238352191471, "learning_rate": 0.0006079398163116611, "loss": 2.9928, "step": 915 }, { "epoch": 4.492004920049201, "grad_norm": 0.3518940501539981, "learning_rate": 0.0006071642725443856, "loss": 3.6598, "step": 916 }, { "epoch": 4.496924969249693, "grad_norm": 0.16537372229724157, "learning_rate": 0.0006063884584453325, "loss": 3.177, "step": 917 }, { "epoch": 4.501845018450185, "grad_norm": 0.7602621034377439, "learning_rate": 0.000605612375971565, "loss": 3.5487, "step": 918 }, { "epoch": 4.506765067650677, "grad_norm": 0.2386325738011221, "learning_rate": 0.0006048360270808225, "loss": 2.888, "step": 919 }, { "epoch": 4.511685116851169, "grad_norm": 0.16137550519778215, "learning_rate": 0.000604059413731517, "loss": 3.1772, "step": 920 }, { "epoch": 4.516605166051661, "grad_norm": 0.34378584933656986, "learning_rate": 0.0006032825378827272, "loss": 3.3707, "step": 921 }, { "epoch": 4.521525215252153, "grad_norm": 0.21350205826077068, "learning_rate": 0.0006025054014941944, "loss": 3.6344, "step": 922 }, { "epoch": 4.5264452644526445, "grad_norm": 0.22287262629470422, "learning_rate": 0.000601728006526317, "loss": 3.137, "step": 923 }, { "epoch": 4.531365313653136, "grad_norm": 0.2558776998460169, "learning_rate": 0.0006009503549401453, "loss": 2.8051, "step": 924 }, { "epoch": 4.536285362853628, "grad_norm": 0.5515472286394457, "learning_rate": 0.0006001724486973774, "loss": 3.1693, "step": 925 }, { "epoch": 4.54120541205412, "grad_norm": 0.198351231631331, "learning_rate": 0.0005993942897603538, "loss": 3.1833, "step": 926 }, { "epoch": 4.546125461254612, "grad_norm": 0.273276908868184, "learning_rate": 0.0005986158800920522, "loss": 3.1254, "step": 927 }, { "epoch": 4.551045510455104, "grad_norm": 0.19023517895440542, "learning_rate": 0.0005978372216560829, "loss": 3.2519, "step": 928 }, { "epoch": 4.555965559655597, "grad_norm": 0.25129375799294834, "learning_rate": 0.0005970583164166837, "loss": 3.4746, "step": 929 }, { "epoch": 4.560885608856088, "grad_norm": 0.2891113001798663, "learning_rate": 0.0005962791663387151, "loss": 3.7117, "step": 930 }, { "epoch": 4.565805658056581, "grad_norm": 0.22061824368781743, "learning_rate": 0.0005954997733876551, "loss": 3.1153, "step": 931 }, { "epoch": 4.570725707257073, "grad_norm": 0.4904261327128776, "learning_rate": 0.0005947201395295944, "loss": 4.0261, "step": 932 }, { "epoch": 4.575645756457565, "grad_norm": 0.19130146824980496, "learning_rate": 0.0005939402667312316, "loss": 3.4305, "step": 933 }, { "epoch": 4.580565805658057, "grad_norm": 0.19472143225133662, "learning_rate": 0.0005931601569598675, "loss": 3.1144, "step": 934 }, { "epoch": 4.585485854858549, "grad_norm": 0.27153220383505966, "learning_rate": 0.0005923798121834016, "loss": 3.7124, "step": 935 }, { "epoch": 4.590405904059041, "grad_norm": 0.2322578948748033, "learning_rate": 0.0005915992343703253, "loss": 2.8301, "step": 936 }, { "epoch": 4.5953259532595325, "grad_norm": 1.4186661168293737, "learning_rate": 0.0005908184254897182, "loss": 3.5589, "step": 937 }, { "epoch": 4.6002460024600245, "grad_norm": 0.26668859116241267, "learning_rate": 0.0005900373875112431, "loss": 3.7922, "step": 938 }, { "epoch": 4.605166051660516, "grad_norm": 0.2480397822431978, "learning_rate": 0.0005892561224051403, "loss": 2.6734, "step": 939 }, { "epoch": 4.610086100861008, "grad_norm": 0.2575839169825418, "learning_rate": 0.0005884746321422233, "loss": 3.6183, "step": 940 }, { "epoch": 4.6150061500615, "grad_norm": 0.7026475529085516, "learning_rate": 0.0005876929186938733, "loss": 3.2153, "step": 941 }, { "epoch": 4.619926199261993, "grad_norm": 1.1564386835421387, "learning_rate": 0.0005869109840320347, "loss": 3.2378, "step": 942 }, { "epoch": 4.624846248462484, "grad_norm": 0.3297779993305543, "learning_rate": 0.0005861288301292103, "loss": 3.6313, "step": 943 }, { "epoch": 4.629766297662977, "grad_norm": 0.41438691768110897, "learning_rate": 0.0005853464589584552, "loss": 3.7099, "step": 944 }, { "epoch": 4.634686346863469, "grad_norm": 0.31364946662098286, "learning_rate": 0.0005845638724933729, "loss": 3.1361, "step": 945 }, { "epoch": 4.639606396063961, "grad_norm": 0.27675902083673254, "learning_rate": 0.0005837810727081102, "loss": 3.5766, "step": 946 }, { "epoch": 4.644526445264453, "grad_norm": 0.3633557474060726, "learning_rate": 0.000582998061577352, "loss": 3.6532, "step": 947 }, { "epoch": 4.649446494464945, "grad_norm": 0.27278532450048937, "learning_rate": 0.0005822148410763161, "loss": 3.69, "step": 948 }, { "epoch": 4.654366543665437, "grad_norm": 0.46059351376689994, "learning_rate": 0.0005814314131807485, "loss": 3.7182, "step": 949 }, { "epoch": 4.659286592865929, "grad_norm": 0.21936770388472196, "learning_rate": 0.0005806477798669185, "loss": 3.597, "step": 950 }, { "epoch": 4.6642066420664205, "grad_norm": 0.4088947034583073, "learning_rate": 0.0005798639431116135, "loss": 3.4222, "step": 951 }, { "epoch": 4.6691266912669125, "grad_norm": 0.26690110043562415, "learning_rate": 0.0005790799048921341, "loss": 3.6115, "step": 952 }, { "epoch": 4.674046740467404, "grad_norm": 0.7741064790082663, "learning_rate": 0.0005782956671862894, "loss": 2.5026, "step": 953 }, { "epoch": 4.678966789667896, "grad_norm": 0.35330606625985045, "learning_rate": 0.0005775112319723912, "loss": 3.1746, "step": 954 }, { "epoch": 4.683886838868388, "grad_norm": 0.1973111474339948, "learning_rate": 0.0005767266012292496, "loss": 3.4522, "step": 955 }, { "epoch": 4.68880688806888, "grad_norm": 0.3248467146378788, "learning_rate": 0.0005759417769361686, "loss": 3.2876, "step": 956 }, { "epoch": 4.693726937269373, "grad_norm": 1.1289024211272845, "learning_rate": 0.0005751567610729398, "loss": 3.0139, "step": 957 }, { "epoch": 4.698646986469865, "grad_norm": 0.8591037537217671, "learning_rate": 0.0005743715556198379, "loss": 3.4909, "step": 958 }, { "epoch": 4.703567035670357, "grad_norm": 0.35773548507790154, "learning_rate": 0.0005735861625576166, "loss": 2.796, "step": 959 }, { "epoch": 4.708487084870849, "grad_norm": 0.19890123480437244, "learning_rate": 0.0005728005838675025, "loss": 2.8786, "step": 960 }, { "epoch": 4.713407134071341, "grad_norm": 0.2024836908773639, "learning_rate": 0.0005720148215311901, "loss": 2.6873, "step": 961 }, { "epoch": 4.718327183271833, "grad_norm": 0.2541843631345393, "learning_rate": 0.0005712288775308377, "loss": 3.3019, "step": 962 }, { "epoch": 4.723247232472325, "grad_norm": 0.2793667102868824, "learning_rate": 0.0005704427538490616, "loss": 2.8424, "step": 963 }, { "epoch": 4.728167281672817, "grad_norm": 0.377606965910954, "learning_rate": 0.0005696564524689312, "loss": 2.9258, "step": 964 }, { "epoch": 4.733087330873309, "grad_norm": 0.2060266157517, "learning_rate": 0.0005688699753739649, "loss": 3.4335, "step": 965 }, { "epoch": 4.7380073800738005, "grad_norm": 0.19763845877873396, "learning_rate": 0.0005680833245481234, "loss": 3.7551, "step": 966 }, { "epoch": 4.7429274292742925, "grad_norm": 0.15137567163387103, "learning_rate": 0.0005672965019758061, "loss": 2.7106, "step": 967 }, { "epoch": 4.747847478474784, "grad_norm": 0.18419804622238098, "learning_rate": 0.0005665095096418456, "loss": 3.1839, "step": 968 }, { "epoch": 4.752767527675276, "grad_norm": 0.5689801543078239, "learning_rate": 0.0005657223495315031, "loss": 2.2966, "step": 969 }, { "epoch": 4.757687576875769, "grad_norm": 0.25586532506228826, "learning_rate": 0.0005649350236304622, "loss": 3.4059, "step": 970 }, { "epoch": 4.762607626076261, "grad_norm": 0.6928330523572424, "learning_rate": 0.0005641475339248257, "loss": 2.8973, "step": 971 }, { "epoch": 4.767527675276753, "grad_norm": 1.3329971648586518, "learning_rate": 0.0005633598824011087, "loss": 2.2126, "step": 972 }, { "epoch": 4.772447724477245, "grad_norm": 0.3207005664621943, "learning_rate": 0.0005625720710462352, "loss": 2.8447, "step": 973 }, { "epoch": 4.777367773677737, "grad_norm": 0.21838914586848798, "learning_rate": 0.0005617841018475322, "loss": 3.1378, "step": 974 }, { "epoch": 4.782287822878229, "grad_norm": 0.30652626107663794, "learning_rate": 0.0005609959767927246, "loss": 3.1472, "step": 975 }, { "epoch": 4.787207872078721, "grad_norm": 0.254947240131906, "learning_rate": 0.0005602076978699305, "loss": 3.6667, "step": 976 }, { "epoch": 4.792127921279213, "grad_norm": 0.508402070414208, "learning_rate": 0.0005594192670676567, "loss": 3.4144, "step": 977 }, { "epoch": 4.797047970479705, "grad_norm": 0.3788077486655659, "learning_rate": 0.0005586306863747927, "loss": 3.1539, "step": 978 }, { "epoch": 4.801968019680197, "grad_norm": 0.2742350602465622, "learning_rate": 0.0005578419577806058, "loss": 2.9902, "step": 979 }, { "epoch": 4.8068880688806885, "grad_norm": 0.3108117408700432, "learning_rate": 0.000557053083274737, "loss": 3.4602, "step": 980 }, { "epoch": 4.8118081180811805, "grad_norm": 0.35734311045809203, "learning_rate": 0.0005562640648471951, "loss": 3.7482, "step": 981 }, { "epoch": 4.816728167281672, "grad_norm": 0.2926459295998522, "learning_rate": 0.0005554749044883518, "loss": 3.4939, "step": 982 }, { "epoch": 4.821648216482165, "grad_norm": 0.2634044916425964, "learning_rate": 0.0005546856041889373, "loss": 3.2829, "step": 983 }, { "epoch": 4.826568265682657, "grad_norm": 0.33707183577355626, "learning_rate": 0.0005538961659400342, "loss": 3.4888, "step": 984 }, { "epoch": 4.831488314883149, "grad_norm": 0.25319005257470584, "learning_rate": 0.0005531065917330736, "loss": 3.4498, "step": 985 }, { "epoch": 4.836408364083641, "grad_norm": 0.3294378023413946, "learning_rate": 0.0005523168835598294, "loss": 3.3065, "step": 986 }, { "epoch": 4.841328413284133, "grad_norm": 0.210652872077263, "learning_rate": 0.0005515270434124135, "loss": 3.6081, "step": 987 }, { "epoch": 4.846248462484625, "grad_norm": 0.5077616942175747, "learning_rate": 0.0005507370732832703, "loss": 2.9492, "step": 988 }, { "epoch": 4.851168511685117, "grad_norm": 0.2276492384292383, "learning_rate": 0.0005499469751651727, "loss": 3.4195, "step": 989 }, { "epoch": 4.856088560885609, "grad_norm": 0.38493294460701694, "learning_rate": 0.0005491567510512163, "loss": 3.1808, "step": 990 }, { "epoch": 4.861008610086101, "grad_norm": 0.21046692909690345, "learning_rate": 0.0005483664029348141, "loss": 3.1167, "step": 991 }, { "epoch": 4.865928659286593, "grad_norm": 0.26125330997686136, "learning_rate": 0.0005475759328096925, "loss": 2.8199, "step": 992 }, { "epoch": 4.870848708487085, "grad_norm": 0.2755750235089007, "learning_rate": 0.0005467853426698852, "loss": 2.8031, "step": 993 }, { "epoch": 4.875768757687577, "grad_norm": 0.3703518653730955, "learning_rate": 0.0005459946345097289, "loss": 2.9239, "step": 994 }, { "epoch": 4.8806888068880685, "grad_norm": 0.16676155398121978, "learning_rate": 0.0005452038103238582, "loss": 3.2098, "step": 995 }, { "epoch": 4.885608856088561, "grad_norm": 0.36079751076783323, "learning_rate": 0.0005444128721072, "loss": 2.4729, "step": 996 }, { "epoch": 4.890528905289053, "grad_norm": 0.25389816361458445, "learning_rate": 0.000543621821854969, "loss": 3.2088, "step": 997 }, { "epoch": 4.895448954489545, "grad_norm": 0.4546580710027096, "learning_rate": 0.0005428306615626626, "loss": 3.1653, "step": 998 }, { "epoch": 4.900369003690037, "grad_norm": 0.4272649376097271, "learning_rate": 0.0005420393932260557, "loss": 3.0864, "step": 999 }, { "epoch": 4.905289052890529, "grad_norm": 0.20450624726287603, "learning_rate": 0.0005412480188411957, "loss": 2.9734, "step": 1000 }, { "epoch": 4.910209102091021, "grad_norm": 0.2592858582595342, "learning_rate": 0.0005404565404043977, "loss": 3.2027, "step": 1001 }, { "epoch": 4.915129151291513, "grad_norm": 0.2001464428754863, "learning_rate": 0.0005396649599122392, "loss": 2.3549, "step": 1002 }, { "epoch": 4.920049200492005, "grad_norm": 0.38056705660425594, "learning_rate": 0.0005388732793615551, "loss": 3.1626, "step": 1003 }, { "epoch": 4.924969249692497, "grad_norm": 0.2911259825316542, "learning_rate": 0.0005380815007494326, "loss": 3.7781, "step": 1004 }, { "epoch": 4.929889298892989, "grad_norm": 0.4745768670854393, "learning_rate": 0.0005372896260732066, "loss": 2.9655, "step": 1005 }, { "epoch": 4.934809348093481, "grad_norm": 0.24873474025488274, "learning_rate": 0.0005364976573304538, "loss": 3.5016, "step": 1006 }, { "epoch": 4.939729397293973, "grad_norm": 0.22015258168343568, "learning_rate": 0.0005357055965189888, "loss": 2.4653, "step": 1007 }, { "epoch": 4.944649446494465, "grad_norm": 0.272474317788895, "learning_rate": 0.0005349134456368583, "loss": 2.9591, "step": 1008 }, { "epoch": 4.949569495694957, "grad_norm": 0.28052795553127385, "learning_rate": 0.0005341212066823356, "loss": 3.6297, "step": 1009 }, { "epoch": 4.9544895448954485, "grad_norm": 0.2273670036487688, "learning_rate": 0.0005333288816539169, "loss": 2.8003, "step": 1010 }, { "epoch": 4.959409594095941, "grad_norm": 0.1812866382794013, "learning_rate": 0.0005325364725503154, "loss": 3.0822, "step": 1011 }, { "epoch": 4.964329643296433, "grad_norm": 0.1909313596431026, "learning_rate": 0.0005317439813704562, "loss": 2.9224, "step": 1012 }, { "epoch": 4.969249692496925, "grad_norm": 0.3304245577449288, "learning_rate": 0.0005309514101134715, "loss": 3.2228, "step": 1013 }, { "epoch": 4.974169741697417, "grad_norm": 0.2587689309911205, "learning_rate": 0.0005301587607786954, "loss": 3.5304, "step": 1014 }, { "epoch": 4.979089790897909, "grad_norm": 0.19425107439550154, "learning_rate": 0.000529366035365659, "loss": 3.3911, "step": 1015 }, { "epoch": 4.984009840098401, "grad_norm": 0.1729790317598892, "learning_rate": 0.000528573235874086, "loss": 3.5636, "step": 1016 }, { "epoch": 4.988929889298893, "grad_norm": 0.2667123423517103, "learning_rate": 0.0005277803643038855, "loss": 3.1571, "step": 1017 }, { "epoch": 4.993849938499385, "grad_norm": 0.14458755749103033, "learning_rate": 0.0005269874226551497, "loss": 3.7004, "step": 1018 }, { "epoch": 4.998769987699877, "grad_norm": 0.17352711295021278, "learning_rate": 0.0005261944129281474, "loss": 3.6337, "step": 1019 }, { "epoch": 5.0, "grad_norm": 0.17352711295021278, "learning_rate": 0.0005254013371233185, "loss": 0.8688, "step": 1020 }, { "epoch": 5.004920049200492, "grad_norm": 0.2962394404750319, "learning_rate": 0.0005246081972412702, "loss": 3.0094, "step": 1021 }, { "epoch": 5.009840098400984, "grad_norm": 0.14499371150721962, "learning_rate": 0.000523814995282771, "loss": 3.5274, "step": 1022 }, { "epoch": 5.014760147601476, "grad_norm": 0.19038549467638213, "learning_rate": 0.0005230217332487462, "loss": 3.3871, "step": 1023 }, { "epoch": 5.019680196801968, "grad_norm": 0.22813195033567563, "learning_rate": 0.0005222284131402722, "loss": 3.805, "step": 1024 }, { "epoch": 5.02460024600246, "grad_norm": 0.35874430576456673, "learning_rate": 0.0005214350369585731, "loss": 2.1572, "step": 1025 }, { "epoch": 5.029520295202952, "grad_norm": 0.28679383010199094, "learning_rate": 0.0005206416067050128, "loss": 3.0864, "step": 1026 }, { "epoch": 5.0344403444034445, "grad_norm": 0.16958374209770172, "learning_rate": 0.0005198481243810927, "loss": 3.0029, "step": 1027 }, { "epoch": 5.039360393603936, "grad_norm": 1.3533687956871538, "learning_rate": 0.0005190545919884453, "loss": 2.7758, "step": 1028 }, { "epoch": 5.044280442804428, "grad_norm": 0.30937205800779805, "learning_rate": 0.0005182610115288295, "loss": 3.5063, "step": 1029 }, { "epoch": 5.04920049200492, "grad_norm": 0.39178015668052624, "learning_rate": 0.0005174673850041249, "loss": 3.3888, "step": 1030 }, { "epoch": 5.054120541205412, "grad_norm": 0.314695423771103, "learning_rate": 0.0005166737144163282, "loss": 3.0244, "step": 1031 }, { "epoch": 5.059040590405904, "grad_norm": 0.3193124740630412, "learning_rate": 0.0005158800017675465, "loss": 3.2911, "step": 1032 }, { "epoch": 5.063960639606396, "grad_norm": 0.2873420001481325, "learning_rate": 0.0005150862490599933, "loss": 2.9693, "step": 1033 }, { "epoch": 5.068880688806888, "grad_norm": 0.3033584473752028, "learning_rate": 0.0005142924582959833, "loss": 3.5192, "step": 1034 }, { "epoch": 5.07380073800738, "grad_norm": 0.26014414311815026, "learning_rate": 0.0005134986314779268, "loss": 3.3665, "step": 1035 }, { "epoch": 5.078720787207872, "grad_norm": 0.23480596527661207, "learning_rate": 0.0005127047706083253, "loss": 3.2685, "step": 1036 }, { "epoch": 5.083640836408364, "grad_norm": 0.18483044054573172, "learning_rate": 0.0005119108776897664, "loss": 3.2402, "step": 1037 }, { "epoch": 5.088560885608856, "grad_norm": 0.29292250459110375, "learning_rate": 0.0005111169547249183, "loss": 2.7994, "step": 1038 }, { "epoch": 5.093480934809348, "grad_norm": 0.3754409759913618, "learning_rate": 0.0005103230037165247, "loss": 3.4253, "step": 1039 }, { "epoch": 5.0984009840098405, "grad_norm": 0.28227500674473666, "learning_rate": 0.0005095290266674006, "loss": 3.2041, "step": 1040 }, { "epoch": 5.1033210332103325, "grad_norm": 0.3462217508427945, "learning_rate": 0.0005087350255804266, "loss": 3.3448, "step": 1041 }, { "epoch": 5.108241082410824, "grad_norm": 0.4157166468096812, "learning_rate": 0.0005079410024585436, "loss": 3.1203, "step": 1042 }, { "epoch": 5.113161131611316, "grad_norm": 0.3286869708217864, "learning_rate": 0.0005071469593047481, "loss": 3.2858, "step": 1043 }, { "epoch": 5.118081180811808, "grad_norm": 0.21953138206482675, "learning_rate": 0.0005063528981220877, "loss": 2.8716, "step": 1044 }, { "epoch": 5.1230012300123, "grad_norm": 0.2435110482643221, "learning_rate": 0.0005055588209136548, "loss": 3.4996, "step": 1045 }, { "epoch": 5.127921279212792, "grad_norm": 0.25295227424298405, "learning_rate": 0.0005047647296825828, "loss": 3.2975, "step": 1046 }, { "epoch": 5.132841328413284, "grad_norm": 0.20726897020554094, "learning_rate": 0.00050397062643204, "loss": 2.9252, "step": 1047 }, { "epoch": 5.137761377613776, "grad_norm": 0.21468336351247358, "learning_rate": 0.000503176513165225, "loss": 2.7149, "step": 1048 }, { "epoch": 5.142681426814268, "grad_norm": 0.3139005273171625, "learning_rate": 0.0005023823918853622, "loss": 3.0821, "step": 1049 }, { "epoch": 5.14760147601476, "grad_norm": 0.209463334604935, "learning_rate": 0.0005015882645956957, "loss": 2.9009, "step": 1050 }, { "epoch": 5.152521525215252, "grad_norm": 0.1984242667737848, "learning_rate": 0.0005007941332994852, "loss": 3.0531, "step": 1051 }, { "epoch": 5.157441574415744, "grad_norm": 0.19197282646628192, "learning_rate": 0.0005, "loss": 3.0686, "step": 1052 }, { "epoch": 5.162361623616236, "grad_norm": 0.309911300732424, "learning_rate": 0.0004992058667005149, "loss": 3.2726, "step": 1053 }, { "epoch": 5.167281672816729, "grad_norm": 0.24959817757654323, "learning_rate": 0.0004984117354043043, "loss": 2.7, "step": 1054 }, { "epoch": 5.1722017220172205, "grad_norm": 0.4224444603421756, "learning_rate": 0.0004976176081146379, "loss": 3.5027, "step": 1055 }, { "epoch": 5.177121771217712, "grad_norm": 0.3887771626074154, "learning_rate": 0.000496823486834775, "loss": 3.4146, "step": 1056 }, { "epoch": 5.182041820418204, "grad_norm": 0.3034736783365048, "learning_rate": 0.0004960293735679601, "loss": 3.3875, "step": 1057 }, { "epoch": 5.186961869618696, "grad_norm": 0.22601927858637405, "learning_rate": 0.0004952352703174172, "loss": 3.1079, "step": 1058 }, { "epoch": 5.191881918819188, "grad_norm": 0.27724014279942594, "learning_rate": 0.0004944411790863452, "loss": 3.4797, "step": 1059 }, { "epoch": 5.19680196801968, "grad_norm": 0.35192289514958963, "learning_rate": 0.0004936471018779124, "loss": 3.056, "step": 1060 }, { "epoch": 5.201722017220172, "grad_norm": 0.31681268113247046, "learning_rate": 0.000492853040695252, "loss": 3.3397, "step": 1061 }, { "epoch": 5.206642066420664, "grad_norm": 0.23298631123153857, "learning_rate": 0.0004920589975414565, "loss": 3.3545, "step": 1062 }, { "epoch": 5.211562115621156, "grad_norm": 0.6584288376737125, "learning_rate": 0.0004912649744195735, "loss": 3.3623, "step": 1063 }, { "epoch": 5.216482164821648, "grad_norm": 0.2548792733792279, "learning_rate": 0.0004904709733325994, "loss": 2.8157, "step": 1064 }, { "epoch": 5.22140221402214, "grad_norm": 0.224487677000855, "learning_rate": 0.0004896769962834754, "loss": 2.9646, "step": 1065 }, { "epoch": 5.226322263222632, "grad_norm": 0.17070655538411528, "learning_rate": 0.0004888830452750818, "loss": 2.8706, "step": 1066 }, { "epoch": 5.231242312423125, "grad_norm": 0.21780035814717755, "learning_rate": 0.00048808912231023365, "loss": 3.344, "step": 1067 }, { "epoch": 5.236162361623617, "grad_norm": 0.3682723024622698, "learning_rate": 0.00048729522939167466, "loss": 3.6147, "step": 1068 }, { "epoch": 5.2410824108241085, "grad_norm": 0.23689605432406216, "learning_rate": 0.0004865013685220733, "loss": 3.3542, "step": 1069 }, { "epoch": 5.2460024600246005, "grad_norm": 0.34117407475191674, "learning_rate": 0.0004857075417040168, "loss": 2.9817, "step": 1070 }, { "epoch": 5.250922509225092, "grad_norm": 0.4640956476799369, "learning_rate": 0.00048491375094000675, "loss": 3.2969, "step": 1071 }, { "epoch": 5.255842558425584, "grad_norm": 0.2774204717605162, "learning_rate": 0.00048411999823245365, "loss": 3.2799, "step": 1072 }, { "epoch": 5.260762607626076, "grad_norm": 0.40359213516815445, "learning_rate": 0.00048332628558367196, "loss": 3.4307, "step": 1073 }, { "epoch": 5.265682656826568, "grad_norm": 0.6740728916153841, "learning_rate": 0.00048253261499587495, "loss": 3.8202, "step": 1074 }, { "epoch": 5.27060270602706, "grad_norm": 0.3581706575141789, "learning_rate": 0.00048173898847117053, "loss": 3.4354, "step": 1075 }, { "epoch": 5.275522755227552, "grad_norm": 0.24468396816031315, "learning_rate": 0.00048094540801155476, "loss": 3.3819, "step": 1076 }, { "epoch": 5.280442804428044, "grad_norm": 0.272766900273804, "learning_rate": 0.0004801518756189074, "loss": 2.7752, "step": 1077 }, { "epoch": 5.285362853628536, "grad_norm": 0.35522119155131354, "learning_rate": 0.0004793583932949874, "loss": 3.3469, "step": 1078 }, { "epoch": 5.290282902829028, "grad_norm": 0.321433214968228, "learning_rate": 0.00047856496304142716, "loss": 3.0755, "step": 1079 }, { "epoch": 5.29520295202952, "grad_norm": 0.3190324343790979, "learning_rate": 0.0004777715868597277, "loss": 3.2588, "step": 1080 }, { "epoch": 5.300123001230013, "grad_norm": 0.3765131747845392, "learning_rate": 0.000476978266751254, "loss": 3.8195, "step": 1081 }, { "epoch": 5.305043050430505, "grad_norm": 0.26862809400389126, "learning_rate": 0.00047618500471722917, "loss": 3.0785, "step": 1082 }, { "epoch": 5.3099630996309966, "grad_norm": 0.29461926708483066, "learning_rate": 0.00047539180275872996, "loss": 3.1142, "step": 1083 }, { "epoch": 5.3148831488314885, "grad_norm": 0.3642349498068616, "learning_rate": 0.00047459866287668156, "loss": 2.7035, "step": 1084 }, { "epoch": 5.31980319803198, "grad_norm": 0.27031450665095663, "learning_rate": 0.0004738055870718528, "loss": 3.0065, "step": 1085 }, { "epoch": 5.324723247232472, "grad_norm": 0.24548549552402524, "learning_rate": 0.00047301257734485013, "loss": 3.2419, "step": 1086 }, { "epoch": 5.329643296432964, "grad_norm": 0.3765611115266949, "learning_rate": 0.00047221963569611447, "loss": 3.2953, "step": 1087 }, { "epoch": 5.334563345633456, "grad_norm": 0.31601904493626093, "learning_rate": 0.0004714267641259142, "loss": 2.7883, "step": 1088 }, { "epoch": 5.339483394833948, "grad_norm": 0.268965624256117, "learning_rate": 0.00047063396463434097, "loss": 3.4045, "step": 1089 }, { "epoch": 5.34440344403444, "grad_norm": 0.8530083620461527, "learning_rate": 0.0004698412392213048, "loss": 3.1774, "step": 1090 }, { "epoch": 5.349323493234932, "grad_norm": 0.8171891648355962, "learning_rate": 0.00046904858988652874, "loss": 3.1978, "step": 1091 }, { "epoch": 5.354243542435424, "grad_norm": 0.4631900855766889, "learning_rate": 0.0004682560186295439, "loss": 3.0459, "step": 1092 }, { "epoch": 5.359163591635916, "grad_norm": 0.44439142673915893, "learning_rate": 0.00046746352744968467, "loss": 2.9565, "step": 1093 }, { "epoch": 5.364083640836409, "grad_norm": 0.3486414784462835, "learning_rate": 0.00046667111834608314, "loss": 3.3075, "step": 1094 }, { "epoch": 5.369003690036901, "grad_norm": 0.3961881921389727, "learning_rate": 0.00046587879331766457, "loss": 3.2734, "step": 1095 }, { "epoch": 5.373923739237393, "grad_norm": 0.2430726592675362, "learning_rate": 0.00046508655436314195, "loss": 2.8911, "step": 1096 }, { "epoch": 5.378843788437885, "grad_norm": 0.2669888343446182, "learning_rate": 0.00046429440348101127, "loss": 2.771, "step": 1097 }, { "epoch": 5.3837638376383765, "grad_norm": 0.3689051235315334, "learning_rate": 0.00046350234266954624, "loss": 3.4629, "step": 1098 }, { "epoch": 5.3886838868388685, "grad_norm": 0.3110821565285645, "learning_rate": 0.0004627103739267936, "loss": 3.5649, "step": 1099 }, { "epoch": 5.39360393603936, "grad_norm": 1.1264826883832577, "learning_rate": 0.00046191849925056745, "loss": 3.4072, "step": 1100 }, { "epoch": 5.398523985239852, "grad_norm": 0.4795229137707865, "learning_rate": 0.000461126720638445, "loss": 3.2878, "step": 1101 }, { "epoch": 5.403444034440344, "grad_norm": 0.39865860012818893, "learning_rate": 0.00046033504008776094, "loss": 3.4552, "step": 1102 }, { "epoch": 5.408364083640836, "grad_norm": 0.908204079308959, "learning_rate": 0.0004595434595956024, "loss": 3.1437, "step": 1103 }, { "epoch": 5.413284132841328, "grad_norm": 0.3582094440979869, "learning_rate": 0.00045875198115880424, "loss": 3.3155, "step": 1104 }, { "epoch": 5.41820418204182, "grad_norm": 0.7000734308957791, "learning_rate": 0.0004579606067739444, "loss": 3.2296, "step": 1105 }, { "epoch": 5.423124231242312, "grad_norm": 0.7670662589628634, "learning_rate": 0.00045716933843733756, "loss": 3.6902, "step": 1106 }, { "epoch": 5.428044280442805, "grad_norm": 0.2527116381315478, "learning_rate": 0.00045637817814503113, "loss": 3.4157, "step": 1107 }, { "epoch": 5.432964329643297, "grad_norm": 0.3325570176581546, "learning_rate": 0.0004555871278928001, "loss": 3.4337, "step": 1108 }, { "epoch": 5.437884378843789, "grad_norm": 0.39448311052234236, "learning_rate": 0.0004547961896761419, "loss": 3.1261, "step": 1109 }, { "epoch": 5.442804428044281, "grad_norm": 0.40606950914280715, "learning_rate": 0.000454005365490271, "loss": 3.6945, "step": 1110 }, { "epoch": 5.447724477244773, "grad_norm": 0.35547104162609905, "learning_rate": 0.0004532146573301149, "loss": 3.0137, "step": 1111 }, { "epoch": 5.4526445264452645, "grad_norm": 0.4442866811831995, "learning_rate": 0.0004524240671903076, "loss": 3.1121, "step": 1112 }, { "epoch": 5.4575645756457565, "grad_norm": 0.3159421398734435, "learning_rate": 0.000451633597065186, "loss": 3.2962, "step": 1113 }, { "epoch": 5.462484624846248, "grad_norm": 0.29046274103955716, "learning_rate": 0.0004508432489487838, "loss": 3.5261, "step": 1114 }, { "epoch": 5.46740467404674, "grad_norm": 0.6056555449197536, "learning_rate": 0.00045005302483482735, "loss": 3.2011, "step": 1115 }, { "epoch": 5.472324723247232, "grad_norm": 0.5314156308833473, "learning_rate": 0.0004492629267167296, "loss": 3.3985, "step": 1116 }, { "epoch": 5.477244772447724, "grad_norm": 0.5059870322085591, "learning_rate": 0.00044847295658758654, "loss": 3.0023, "step": 1117 }, { "epoch": 5.482164821648216, "grad_norm": 0.3145830957769309, "learning_rate": 0.00044768311644017065, "loss": 3.2216, "step": 1118 }, { "epoch": 5.487084870848708, "grad_norm": 0.5041649706792797, "learning_rate": 0.0004468934082669264, "loss": 2.7671, "step": 1119 }, { "epoch": 5.492004920049201, "grad_norm": 0.7355145247289282, "learning_rate": 0.0004461038340599659, "loss": 3.3608, "step": 1120 }, { "epoch": 5.496924969249693, "grad_norm": 0.42706233847981523, "learning_rate": 0.00044531439581106295, "loss": 3.3593, "step": 1121 }, { "epoch": 5.501845018450185, "grad_norm": 0.36559990443125007, "learning_rate": 0.0004445250955116482, "loss": 3.3154, "step": 1122 }, { "epoch": 5.506765067650677, "grad_norm": 0.28239147126997055, "learning_rate": 0.000443735935152805, "loss": 2.989, "step": 1123 }, { "epoch": 5.511685116851169, "grad_norm": 0.5287248556281361, "learning_rate": 0.0004429469167252631, "loss": 3.0165, "step": 1124 }, { "epoch": 5.516605166051661, "grad_norm": 0.9633126841601974, "learning_rate": 0.0004421580422193943, "loss": 3.2265, "step": 1125 }, { "epoch": 5.521525215252153, "grad_norm": 0.3575702039577438, "learning_rate": 0.0004413693136252075, "loss": 3.3586, "step": 1126 }, { "epoch": 5.5264452644526445, "grad_norm": 0.28349954057641247, "learning_rate": 0.00044058073293234333, "loss": 3.0191, "step": 1127 }, { "epoch": 5.531365313653136, "grad_norm": 0.616980108991166, "learning_rate": 0.00043979230213006944, "loss": 3.2254, "step": 1128 }, { "epoch": 5.536285362853628, "grad_norm": 0.4205141342103716, "learning_rate": 0.0004390040232072755, "loss": 2.5573, "step": 1129 }, { "epoch": 5.54120541205412, "grad_norm": 0.34479206270478124, "learning_rate": 0.00043821589815246786, "loss": 2.7997, "step": 1130 }, { "epoch": 5.546125461254612, "grad_norm": 0.4054777887094137, "learning_rate": 0.00043742792895376493, "loss": 3.3303, "step": 1131 }, { "epoch": 5.551045510455104, "grad_norm": 0.4043884193694926, "learning_rate": 0.0004366401175988914, "loss": 3.13, "step": 1132 }, { "epoch": 5.555965559655597, "grad_norm": 0.41921239876972544, "learning_rate": 0.0004358524660751746, "loss": 3.4357, "step": 1133 }, { "epoch": 5.560885608856088, "grad_norm": 0.6096274840786755, "learning_rate": 0.0004350649763695377, "loss": 2.6377, "step": 1134 }, { "epoch": 5.565805658056581, "grad_norm": 0.5840817941003188, "learning_rate": 0.0004342776504684971, "loss": 3.2207, "step": 1135 }, { "epoch": 5.570725707257073, "grad_norm": 0.6381155532722982, "learning_rate": 0.00043349049035815444, "loss": 3.3081, "step": 1136 }, { "epoch": 5.575645756457565, "grad_norm": 0.6484171913707083, "learning_rate": 0.0004327034980241941, "loss": 3.2623, "step": 1137 }, { "epoch": 5.580565805658057, "grad_norm": 0.28729605242058576, "learning_rate": 0.0004319166754518768, "loss": 3.2074, "step": 1138 }, { "epoch": 5.585485854858549, "grad_norm": 0.4791649205984591, "learning_rate": 0.0004311300246260352, "loss": 2.8985, "step": 1139 }, { "epoch": 5.590405904059041, "grad_norm": 0.5875432406241601, "learning_rate": 0.00043034354753106863, "loss": 3.1154, "step": 1140 }, { "epoch": 5.5953259532595325, "grad_norm": 0.8115174930818398, "learning_rate": 0.00042955724615093845, "loss": 2.9662, "step": 1141 }, { "epoch": 5.6002460024600245, "grad_norm": 0.4347855411657189, "learning_rate": 0.00042877112246916235, "loss": 3.4559, "step": 1142 }, { "epoch": 5.605166051660516, "grad_norm": 0.3892162463008815, "learning_rate": 0.0004279851784688099, "loss": 2.8082, "step": 1143 }, { "epoch": 5.610086100861008, "grad_norm": 0.44354659695326576, "learning_rate": 0.00042719941613249767, "loss": 3.7059, "step": 1144 }, { "epoch": 5.6150061500615, "grad_norm": 0.3182868892859793, "learning_rate": 0.0004264138374423835, "loss": 3.3118, "step": 1145 }, { "epoch": 5.619926199261993, "grad_norm": 0.3470073812726253, "learning_rate": 0.00042562844438016206, "loss": 3.095, "step": 1146 }, { "epoch": 5.624846248462484, "grad_norm": 0.5018795859185273, "learning_rate": 0.0004248432389270603, "loss": 3.2517, "step": 1147 }, { "epoch": 5.629766297662977, "grad_norm": 0.4257134952867155, "learning_rate": 0.00042405822306383156, "loss": 3.5777, "step": 1148 }, { "epoch": 5.634686346863469, "grad_norm": 0.4835059851917244, "learning_rate": 0.0004232733987707505, "loss": 3.1701, "step": 1149 }, { "epoch": 5.639606396063961, "grad_norm": 0.48487888383989586, "learning_rate": 0.00042248876802760906, "loss": 3.078, "step": 1150 }, { "epoch": 5.644526445264453, "grad_norm": 0.3543553502092027, "learning_rate": 0.0004217043328137108, "loss": 3.352, "step": 1151 }, { "epoch": 5.649446494464945, "grad_norm": 0.6509912585216572, "learning_rate": 0.00042092009510786584, "loss": 2.5354, "step": 1152 }, { "epoch": 5.654366543665437, "grad_norm": 0.6885246251309031, "learning_rate": 0.0004201360568883865, "loss": 2.5948, "step": 1153 }, { "epoch": 5.659286592865929, "grad_norm": 0.5804204459824612, "learning_rate": 0.00041935222013308153, "loss": 3.3309, "step": 1154 }, { "epoch": 5.6642066420664205, "grad_norm": 1.4645205475966085, "learning_rate": 0.0004185685868192516, "loss": 3.4068, "step": 1155 }, { "epoch": 5.6691266912669125, "grad_norm": 1.2223193849896665, "learning_rate": 0.00041778515892368394, "loss": 2.8415, "step": 1156 }, { "epoch": 5.674046740467404, "grad_norm": 0.696299374230636, "learning_rate": 0.00041700193842264814, "loss": 2.7417, "step": 1157 }, { "epoch": 5.678966789667896, "grad_norm": 0.8693979415913924, "learning_rate": 0.0004162189272918897, "loss": 3.4307, "step": 1158 }, { "epoch": 5.683886838868388, "grad_norm": 0.6234535226228702, "learning_rate": 0.00041543612750662715, "loss": 2.8964, "step": 1159 }, { "epoch": 5.68880688806888, "grad_norm": 0.8699171471932337, "learning_rate": 0.00041465354104154497, "loss": 2.7326, "step": 1160 }, { "epoch": 5.693726937269373, "grad_norm": 1.5925001492287447, "learning_rate": 0.00041387116987078986, "loss": 2.7267, "step": 1161 }, { "epoch": 5.698646986469865, "grad_norm": 0.7240964996595003, "learning_rate": 0.00041308901596796534, "loss": 3.2484, "step": 1162 }, { "epoch": 5.703567035670357, "grad_norm": 1.5681408719098775, "learning_rate": 0.00041230708130612686, "loss": 3.1807, "step": 1163 }, { "epoch": 5.708487084870849, "grad_norm": 2.3450347898920887, "learning_rate": 0.00041152536785777683, "loss": 3.4496, "step": 1164 }, { "epoch": 5.713407134071341, "grad_norm": 1.1583976957721596, "learning_rate": 0.0004107438775948598, "loss": 2.8458, "step": 1165 }, { "epoch": 5.718327183271833, "grad_norm": 0.6281525843157977, "learning_rate": 0.0004099626124887569, "loss": 3.3187, "step": 1166 }, { "epoch": 5.723247232472325, "grad_norm": 0.7775949309981518, "learning_rate": 0.0004091815745102818, "loss": 3.1415, "step": 1167 }, { "epoch": 5.728167281672817, "grad_norm": 1.1509029440660363, "learning_rate": 0.00040840076562967485, "loss": 3.2418, "step": 1168 }, { "epoch": 5.733087330873309, "grad_norm": 0.9376169056934194, "learning_rate": 0.0004076201878165985, "loss": 2.95, "step": 1169 }, { "epoch": 5.7380073800738005, "grad_norm": 0.9062593668662051, "learning_rate": 0.00040683984304013237, "loss": 3.0046, "step": 1170 }, { "epoch": 5.7429274292742925, "grad_norm": 0.39973210389113817, "learning_rate": 0.0004060597332687684, "loss": 2.9545, "step": 1171 }, { "epoch": 5.747847478474784, "grad_norm": 0.7036353427181765, "learning_rate": 0.0004052798604704056, "loss": 2.764, "step": 1172 }, { "epoch": 5.752767527675276, "grad_norm": 0.4564780662560223, "learning_rate": 0.0004045002266123449, "loss": 2.8647, "step": 1173 }, { "epoch": 5.757687576875769, "grad_norm": 0.5609326926527847, "learning_rate": 0.000403720833661285, "loss": 2.8664, "step": 1174 }, { "epoch": 5.762607626076261, "grad_norm": 0.8747818791203325, "learning_rate": 0.00040294168358331645, "loss": 3.0693, "step": 1175 }, { "epoch": 5.767527675276753, "grad_norm": 1.1217024862046066, "learning_rate": 0.00040216277834391706, "loss": 3.3624, "step": 1176 }, { "epoch": 5.772447724477245, "grad_norm": 0.5605453657577469, "learning_rate": 0.0004013841199079479, "loss": 2.755, "step": 1177 }, { "epoch": 5.777367773677737, "grad_norm": 0.5198092069989948, "learning_rate": 0.00040060571023964623, "loss": 2.8011, "step": 1178 }, { "epoch": 5.782287822878229, "grad_norm": 3.0237838367734966, "learning_rate": 0.00039982755130262265, "loss": 3.027, "step": 1179 }, { "epoch": 5.787207872078721, "grad_norm": 1.0622495971291914, "learning_rate": 0.0003990496450598549, "loss": 3.0433, "step": 1180 }, { "epoch": 5.792127921279213, "grad_norm": 0.5408364088572922, "learning_rate": 0.0003982719934736832, "loss": 3.0836, "step": 1181 }, { "epoch": 5.797047970479705, "grad_norm": 0.7392779010378457, "learning_rate": 0.00039749459850580554, "loss": 2.6715, "step": 1182 }, { "epoch": 5.801968019680197, "grad_norm": 0.8670045936732498, "learning_rate": 0.00039671746211727277, "loss": 3.3082, "step": 1183 }, { "epoch": 5.8068880688806885, "grad_norm": 0.5111202332330547, "learning_rate": 0.00039594058626848304, "loss": 2.5013, "step": 1184 }, { "epoch": 5.8118081180811805, "grad_norm": 0.589357575022911, "learning_rate": 0.0003951639729191775, "loss": 3.0804, "step": 1185 }, { "epoch": 5.816728167281672, "grad_norm": 0.864002953481196, "learning_rate": 0.0003943876240284351, "loss": 2.79, "step": 1186 }, { "epoch": 5.821648216482165, "grad_norm": 0.4969401870975754, "learning_rate": 0.00039361154155466763, "loss": 2.741, "step": 1187 }, { "epoch": 5.826568265682657, "grad_norm": 0.8744151829329833, "learning_rate": 0.0003928357274556145, "loss": 3.1963, "step": 1188 }, { "epoch": 5.831488314883149, "grad_norm": 0.6102785697500394, "learning_rate": 0.0003920601836883389, "loss": 2.979, "step": 1189 }, { "epoch": 5.836408364083641, "grad_norm": 0.7629673419498022, "learning_rate": 0.00039128491220922156, "loss": 3.0733, "step": 1190 }, { "epoch": 5.841328413284133, "grad_norm": 0.48002566084689974, "learning_rate": 0.0003905099149739563, "loss": 3.1503, "step": 1191 }, { "epoch": 5.846248462484625, "grad_norm": 0.6799011399485555, "learning_rate": 0.0003897351939375451, "loss": 2.8052, "step": 1192 }, { "epoch": 5.851168511685117, "grad_norm": 0.6724681854791119, "learning_rate": 0.00038896075105429356, "loss": 3.2041, "step": 1193 }, { "epoch": 5.856088560885609, "grad_norm": 0.8569646121050482, "learning_rate": 0.00038818658827780516, "loss": 3.1287, "step": 1194 }, { "epoch": 5.861008610086101, "grad_norm": 0.5961580938976805, "learning_rate": 0.0003874127075609774, "loss": 3.0453, "step": 1195 }, { "epoch": 5.865928659286593, "grad_norm": 1.1056994499484878, "learning_rate": 0.0003866391108559956, "loss": 3.2035, "step": 1196 }, { "epoch": 5.870848708487085, "grad_norm": 0.7847233899718217, "learning_rate": 0.00038586580011432895, "loss": 2.9769, "step": 1197 }, { "epoch": 5.875768757687577, "grad_norm": 0.5502684791127367, "learning_rate": 0.0003850927772867253, "loss": 3.1006, "step": 1198 }, { "epoch": 5.8806888068880685, "grad_norm": 10.675586952016424, "learning_rate": 0.0003843200443232061, "loss": 3.1242, "step": 1199 }, { "epoch": 5.885608856088561, "grad_norm": 0.9680224701264311, "learning_rate": 0.0003835476031730615, "loss": 2.4881, "step": 1200 }, { "epoch": 5.890528905289053, "grad_norm": 1.1983624100429777, "learning_rate": 0.000382775455784846, "loss": 2.8678, "step": 1201 }, { "epoch": 5.895448954489545, "grad_norm": 0.9604079306842668, "learning_rate": 0.00038200360410637236, "loss": 2.8142, "step": 1202 }, { "epoch": 5.900369003690037, "grad_norm": 0.7540184571152743, "learning_rate": 0.00038123205008470815, "loss": 3.2091, "step": 1203 }, { "epoch": 5.905289052890529, "grad_norm": 1.5151962530704328, "learning_rate": 0.0003804607956661692, "loss": 2.6566, "step": 1204 }, { "epoch": 5.910209102091021, "grad_norm": 1.230616103682996, "learning_rate": 0.0003796898427963163, "loss": 2.9462, "step": 1205 }, { "epoch": 5.915129151291513, "grad_norm": 1.052071965605924, "learning_rate": 0.000378919193419949, "loss": 3.0532, "step": 1206 }, { "epoch": 5.920049200492005, "grad_norm": 2.2317526644012813, "learning_rate": 0.0003781488494811022, "loss": 2.8706, "step": 1207 }, { "epoch": 5.924969249692497, "grad_norm": 1.632809347740664, "learning_rate": 0.000377378812923039, "loss": 3.1331, "step": 1208 }, { "epoch": 5.929889298892989, "grad_norm": 1.293045614813907, "learning_rate": 0.000376609085688248, "loss": 2.738, "step": 1209 }, { "epoch": 5.934809348093481, "grad_norm": 1.2955332692079102, "learning_rate": 0.0003758396697184373, "loss": 2.9064, "step": 1210 }, { "epoch": 5.939729397293973, "grad_norm": 1.1056691993198347, "learning_rate": 0.00037507056695452963, "loss": 2.8262, "step": 1211 }, { "epoch": 5.944649446494465, "grad_norm": 1.4913797477270452, "learning_rate": 0.00037430177933665777, "loss": 3.0337, "step": 1212 }, { "epoch": 5.949569495694957, "grad_norm": 1.427180628144125, "learning_rate": 0.0003735333088041596, "loss": 3.0523, "step": 1213 }, { "epoch": 5.9544895448954485, "grad_norm": 1.197906120389189, "learning_rate": 0.00037276515729557305, "loss": 2.881, "step": 1214 }, { "epoch": 5.959409594095941, "grad_norm": 1.373885435047052, "learning_rate": 0.00037199732674863124, "loss": 3.0253, "step": 1215 }, { "epoch": 5.964329643296433, "grad_norm": 1.6738443573270205, "learning_rate": 0.0003712298191002577, "loss": 3.1692, "step": 1216 }, { "epoch": 5.969249692496925, "grad_norm": 0.7598279248114839, "learning_rate": 0.0003704626362865612, "loss": 2.318, "step": 1217 }, { "epoch": 5.974169741697417, "grad_norm": 1.7815363416603949, "learning_rate": 0.0003696957802428312, "loss": 2.8176, "step": 1218 }, { "epoch": 5.979089790897909, "grad_norm": 1.180197806459317, "learning_rate": 0.0003689292529035332, "loss": 2.7928, "step": 1219 }, { "epoch": 5.984009840098401, "grad_norm": 2.663134280124017, "learning_rate": 0.0003681630562023033, "loss": 2.2471, "step": 1220 }, { "epoch": 5.988929889298893, "grad_norm": 1.147782152478927, "learning_rate": 0.0003673971920719431, "loss": 2.9794, "step": 1221 }, { "epoch": 5.993849938499385, "grad_norm": 1.3214776994210629, "learning_rate": 0.0003666316624444158, "loss": 3.0754, "step": 1222 }, { "epoch": 5.998769987699877, "grad_norm": 1.4632153924061089, "learning_rate": 0.00036586646925084056, "loss": 2.2517, "step": 1223 }, { "epoch": 6.0, "grad_norm": 1.4632153924061089, "learning_rate": 0.0003651016144214878, "loss": 0.8113, "step": 1224 }, { "epoch": 6.004920049200492, "grad_norm": 1.2715707375976626, "learning_rate": 0.0003643370998857748, "loss": 2.8786, "step": 1225 }, { "epoch": 6.009840098400984, "grad_norm": 1.1393906034949517, "learning_rate": 0.0003635729275722599, "loss": 3.1635, "step": 1226 }, { "epoch": 6.014760147601476, "grad_norm": 0.8402313054665205, "learning_rate": 0.0003628090994086384, "loss": 2.5789, "step": 1227 }, { "epoch": 6.019680196801968, "grad_norm": 1.0397216233099669, "learning_rate": 0.00036204561732173726, "loss": 3.002, "step": 1228 }, { "epoch": 6.02460024600246, "grad_norm": 1.092535498853554, "learning_rate": 0.0003612824832375109, "loss": 2.6114, "step": 1229 }, { "epoch": 6.029520295202952, "grad_norm": 0.9670794542731984, "learning_rate": 0.00036051969908103493, "loss": 2.6333, "step": 1230 }, { "epoch": 6.0344403444034445, "grad_norm": 1.8763434491182267, "learning_rate": 0.0003597572667765035, "loss": 2.8655, "step": 1231 }, { "epoch": 6.039360393603936, "grad_norm": 1.2894127544709302, "learning_rate": 0.0003589951882472223, "loss": 2.7104, "step": 1232 }, { "epoch": 6.044280442804428, "grad_norm": 1.5924334351197003, "learning_rate": 0.00035823346541560494, "loss": 2.668, "step": 1233 }, { "epoch": 6.04920049200492, "grad_norm": 0.9586876578762002, "learning_rate": 0.00035747210020316733, "loss": 2.454, "step": 1234 }, { "epoch": 6.054120541205412, "grad_norm": 2.2557028332233195, "learning_rate": 0.00035671109453052377, "loss": 2.8331, "step": 1235 }, { "epoch": 6.059040590405904, "grad_norm": 1.0264129538714646, "learning_rate": 0.0003559504503173812, "loss": 3.1307, "step": 1236 }, { "epoch": 6.063960639606396, "grad_norm": 1.3399187740139034, "learning_rate": 0.0003551901694825352, "loss": 2.711, "step": 1237 }, { "epoch": 6.068880688806888, "grad_norm": 3.1376261620182664, "learning_rate": 0.0003544302539438642, "loss": 2.0468, "step": 1238 }, { "epoch": 6.07380073800738, "grad_norm": 1.4244128859698455, "learning_rate": 0.0003536707056183254, "loss": 2.4858, "step": 1239 }, { "epoch": 6.078720787207872, "grad_norm": 0.6703666288789721, "learning_rate": 0.0003529115264219496, "loss": 2.4266, "step": 1240 }, { "epoch": 6.083640836408364, "grad_norm": 0.7454398719467444, "learning_rate": 0.0003521527182698365, "loss": 2.5906, "step": 1241 }, { "epoch": 6.088560885608856, "grad_norm": 6.5116338848521575, "learning_rate": 0.0003513942830761497, "loss": 2.8778, "step": 1242 }, { "epoch": 6.093480934809348, "grad_norm": 12.805716764741517, "learning_rate": 0.0003506362227541121, "loss": 2.529, "step": 1243 }, { "epoch": 6.0984009840098405, "grad_norm": 1.8309127647804633, "learning_rate": 0.00034987853921600087, "loss": 2.4904, "step": 1244 }, { "epoch": 6.1033210332103325, "grad_norm": 1.385227037694236, "learning_rate": 0.0003491212343731427, "loss": 2.6763, "step": 1245 }, { "epoch": 6.108241082410824, "grad_norm": 1.7088965372908156, "learning_rate": 0.00034836431013590924, "loss": 3.237, "step": 1246 }, { "epoch": 6.113161131611316, "grad_norm": 0.8926246203577644, "learning_rate": 0.00034760776841371145, "loss": 2.8256, "step": 1247 }, { "epoch": 6.118081180811808, "grad_norm": 4.61417943673406, "learning_rate": 0.00034685161111499565, "loss": 3.0758, "step": 1248 }, { "epoch": 6.1230012300123, "grad_norm": 1.0171741606244018, "learning_rate": 0.00034609584014723904, "loss": 2.9208, "step": 1249 }, { "epoch": 6.127921279212792, "grad_norm": 1.1062825936129506, "learning_rate": 0.0003453404574169433, "loss": 3.2372, "step": 1250 }, { "epoch": 6.132841328413284, "grad_norm": 1.1970987595522924, "learning_rate": 0.0003445854648296312, "loss": 2.6501, "step": 1251 }, { "epoch": 6.137761377613776, "grad_norm": 0.9518893704118436, "learning_rate": 0.00034383086428984124, "loss": 2.6787, "step": 1252 }, { "epoch": 6.142681426814268, "grad_norm": 0.599400064897036, "learning_rate": 0.00034307665770112305, "loss": 2.401, "step": 1253 }, { "epoch": 6.14760147601476, "grad_norm": 0.9809165651842763, "learning_rate": 0.00034232284696603257, "loss": 2.9279, "step": 1254 }, { "epoch": 6.152521525215252, "grad_norm": 3.0083173901577767, "learning_rate": 0.0003415694339861266, "loss": 2.641, "step": 1255 }, { "epoch": 6.157441574415744, "grad_norm": 0.9222704948892093, "learning_rate": 0.0003408164206619595, "loss": 2.5624, "step": 1256 }, { "epoch": 6.162361623616236, "grad_norm": 0.9963371155547893, "learning_rate": 0.00034006380889307664, "loss": 3.0496, "step": 1257 }, { "epoch": 6.167281672816729, "grad_norm": 0.9742112769038954, "learning_rate": 0.0003393116005780108, "loss": 2.4773, "step": 1258 }, { "epoch": 6.1722017220172205, "grad_norm": 1.132157494483649, "learning_rate": 0.00033855979761427703, "loss": 2.7353, "step": 1259 }, { "epoch": 6.177121771217712, "grad_norm": 1.9400586890364648, "learning_rate": 0.0003378084018983676, "loss": 2.9177, "step": 1260 }, { "epoch": 6.182041820418204, "grad_norm": 1.530574150278278, "learning_rate": 0.0003370574153257474, "loss": 2.8901, "step": 1261 }, { "epoch": 6.186961869618696, "grad_norm": 1.850788378012793, "learning_rate": 0.00033630683979085006, "loss": 3.0138, "step": 1262 }, { "epoch": 6.191881918819188, "grad_norm": 1.3544540818555728, "learning_rate": 0.0003355566771870714, "loss": 2.6998, "step": 1263 }, { "epoch": 6.19680196801968, "grad_norm": 2.695589580730902, "learning_rate": 0.00033480692940676573, "loss": 3.1256, "step": 1264 }, { "epoch": 6.201722017220172, "grad_norm": 2.0636830248431552, "learning_rate": 0.00033405759834124115, "loss": 2.9625, "step": 1265 }, { "epoch": 6.206642066420664, "grad_norm": 2.5391412566208014, "learning_rate": 0.0003333086858807547, "loss": 2.4924, "step": 1266 }, { "epoch": 6.211562115621156, "grad_norm": 1.3454748142192139, "learning_rate": 0.00033256019391450693, "loss": 3.041, "step": 1267 }, { "epoch": 6.216482164821648, "grad_norm": 2.8048310833321786, "learning_rate": 0.00033181212433063825, "loss": 3.2038, "step": 1268 }, { "epoch": 6.22140221402214, "grad_norm": 1.3291676481818184, "learning_rate": 0.0003310644790162232, "loss": 2.8374, "step": 1269 }, { "epoch": 6.226322263222632, "grad_norm": 1.6058694001056157, "learning_rate": 0.00033031725985726613, "loss": 2.7789, "step": 1270 }, { "epoch": 6.231242312423125, "grad_norm": 0.9673170723629201, "learning_rate": 0.00032957046873869646, "loss": 2.7155, "step": 1271 }, { "epoch": 6.236162361623617, "grad_norm": 1.3759013627080179, "learning_rate": 0.00032882410754436374, "loss": 2.8854, "step": 1272 }, { "epoch": 6.2410824108241085, "grad_norm": 1.860153262926024, "learning_rate": 0.0003280781781570328, "loss": 2.9654, "step": 1273 }, { "epoch": 6.2460024600246005, "grad_norm": 0.8726904977201452, "learning_rate": 0.0003273326824583798, "loss": 2.7978, "step": 1274 }, { "epoch": 6.250922509225092, "grad_norm": 0.96554628820015, "learning_rate": 0.00032658762232898645, "loss": 2.7573, "step": 1275 }, { "epoch": 6.255842558425584, "grad_norm": 1.1091645343874141, "learning_rate": 0.00032584299964833553, "loss": 2.8419, "step": 1276 }, { "epoch": 6.260762607626076, "grad_norm": 0.9423019447395738, "learning_rate": 0.0003250988162948067, "loss": 2.5156, "step": 1277 }, { "epoch": 6.265682656826568, "grad_norm": 5.622179778083354, "learning_rate": 0.0003243550741456712, "loss": 2.9053, "step": 1278 }, { "epoch": 6.27060270602706, "grad_norm": 1.4140948314283581, "learning_rate": 0.00032361177507708716, "loss": 2.3533, "step": 1279 }, { "epoch": 6.275522755227552, "grad_norm": 0.9001699725268252, "learning_rate": 0.0003228689209640954, "loss": 2.5643, "step": 1280 }, { "epoch": 6.280442804428044, "grad_norm": 1.2310925162148039, "learning_rate": 0.0003221265136806139, "loss": 2.8278, "step": 1281 }, { "epoch": 6.285362853628536, "grad_norm": 0.6765796095818714, "learning_rate": 0.0003213845550994337, "loss": 2.5776, "step": 1282 }, { "epoch": 6.290282902829028, "grad_norm": 0.9935237034482662, "learning_rate": 0.0003206430470922137, "loss": 3.0101, "step": 1283 }, { "epoch": 6.29520295202952, "grad_norm": 1.2521249007860638, "learning_rate": 0.0003199019915294765, "loss": 2.7852, "step": 1284 }, { "epoch": 6.300123001230013, "grad_norm": 3.019923739214347, "learning_rate": 0.0003191613902806031, "loss": 2.7643, "step": 1285 }, { "epoch": 6.305043050430505, "grad_norm": 1.0775931109041912, "learning_rate": 0.0003184212452138288, "loss": 2.7141, "step": 1286 }, { "epoch": 6.3099630996309966, "grad_norm": 1.2731256122336412, "learning_rate": 0.00031768155819623766, "loss": 2.5212, "step": 1287 }, { "epoch": 6.3148831488314885, "grad_norm": 1.3180575473417462, "learning_rate": 0.0003169423310937587, "loss": 2.5236, "step": 1288 }, { "epoch": 6.31980319803198, "grad_norm": 1.1305860207864622, "learning_rate": 0.00031620356577116037, "loss": 2.5944, "step": 1289 }, { "epoch": 6.324723247232472, "grad_norm": 3.8750381669890617, "learning_rate": 0.0003154652640920466, "loss": 2.4685, "step": 1290 }, { "epoch": 6.329643296432964, "grad_norm": 1.6477805596714563, "learning_rate": 0.00031472742791885123, "loss": 2.7498, "step": 1291 }, { "epoch": 6.334563345633456, "grad_norm": 1.3476333272482584, "learning_rate": 0.00031399005911283485, "loss": 2.817, "step": 1292 }, { "epoch": 6.339483394833948, "grad_norm": 5.4562878275642, "learning_rate": 0.00031325315953407783, "loss": 2.8098, "step": 1293 }, { "epoch": 6.34440344403444, "grad_norm": 1.37075887874305, "learning_rate": 0.0003125167310414777, "loss": 2.5362, "step": 1294 }, { "epoch": 6.349323493234932, "grad_norm": 1.8162596917202158, "learning_rate": 0.00031178077549274326, "loss": 2.7216, "step": 1295 }, { "epoch": 6.354243542435424, "grad_norm": 1.1483880048722346, "learning_rate": 0.0003110452947443904, "loss": 2.9668, "step": 1296 }, { "epoch": 6.359163591635916, "grad_norm": 2.250259253676778, "learning_rate": 0.00031031029065173706, "loss": 2.9602, "step": 1297 }, { "epoch": 6.364083640836409, "grad_norm": 1.261541736947044, "learning_rate": 0.00030957576506889927, "loss": 2.8173, "step": 1298 }, { "epoch": 6.369003690036901, "grad_norm": 1.1570417792362047, "learning_rate": 0.0003088417198487855, "loss": 2.649, "step": 1299 }, { "epoch": 6.373923739237393, "grad_norm": 1.1139623438475654, "learning_rate": 0.0003081081568430927, "loss": 2.9663, "step": 1300 }, { "epoch": 6.378843788437885, "grad_norm": 1.1566855852279916, "learning_rate": 0.0003073750779023014, "loss": 2.5699, "step": 1301 }, { "epoch": 6.3837638376383765, "grad_norm": 1.0052089325975895, "learning_rate": 0.0003066424848756707, "loss": 2.8305, "step": 1302 }, { "epoch": 6.3886838868388685, "grad_norm": 1.659661165473321, "learning_rate": 0.00030591037961123416, "loss": 2.5646, "step": 1303 }, { "epoch": 6.39360393603936, "grad_norm": 1.8577897677926398, "learning_rate": 0.00030517876395579524, "loss": 2.4171, "step": 1304 }, { "epoch": 6.398523985239852, "grad_norm": 1.2795065890176769, "learning_rate": 0.00030444763975492206, "loss": 2.9543, "step": 1305 }, { "epoch": 6.403444034440344, "grad_norm": 1.2584001002983176, "learning_rate": 0.00030371700885294264, "loss": 2.7869, "step": 1306 }, { "epoch": 6.408364083640836, "grad_norm": 0.6884191507874449, "learning_rate": 0.00030298687309294103, "loss": 2.0909, "step": 1307 }, { "epoch": 6.413284132841328, "grad_norm": 1.0817283713692791, "learning_rate": 0.0003022572343167522, "loss": 2.7259, "step": 1308 }, { "epoch": 6.41820418204182, "grad_norm": 1.3350498916048206, "learning_rate": 0.0003015280943649573, "loss": 2.4704, "step": 1309 }, { "epoch": 6.423124231242312, "grad_norm": 0.970968361064511, "learning_rate": 0.00030079945507687926, "loss": 2.4175, "step": 1310 }, { "epoch": 6.428044280442805, "grad_norm": 1.1586256582665484, "learning_rate": 0.00030007131829057804, "loss": 2.665, "step": 1311 }, { "epoch": 6.432964329643297, "grad_norm": 0.767889302204255, "learning_rate": 0.0002993436858428459, "loss": 2.5911, "step": 1312 }, { "epoch": 6.437884378843789, "grad_norm": 0.7477905480054344, "learning_rate": 0.00029861655956920285, "loss": 1.8754, "step": 1313 }, { "epoch": 6.442804428044281, "grad_norm": 2.5527565115483184, "learning_rate": 0.00029788994130389233, "loss": 2.4886, "step": 1314 }, { "epoch": 6.447724477244773, "grad_norm": 2.7274986447286733, "learning_rate": 0.00029716383287987545, "loss": 2.8288, "step": 1315 }, { "epoch": 6.4526445264452645, "grad_norm": 0.7821846826744107, "learning_rate": 0.00029643823612882846, "loss": 2.3219, "step": 1316 }, { "epoch": 6.4575645756457565, "grad_norm": 1.4974293602668711, "learning_rate": 0.000295713152881136, "loss": 2.6688, "step": 1317 }, { "epoch": 6.462484624846248, "grad_norm": 1.2807160322639262, "learning_rate": 0.0002949885849658874, "loss": 2.5217, "step": 1318 }, { "epoch": 6.46740467404674, "grad_norm": 0.820710626336044, "learning_rate": 0.0002942645342108723, "loss": 2.5769, "step": 1319 }, { "epoch": 6.472324723247232, "grad_norm": 0.9906011123118152, "learning_rate": 0.0002935410024425754, "loss": 2.3632, "step": 1320 }, { "epoch": 6.477244772447724, "grad_norm": 0.8011828946549142, "learning_rate": 0.0002928179914861726, "loss": 2.063, "step": 1321 }, { "epoch": 6.482164821648216, "grad_norm": 0.8450013280390781, "learning_rate": 0.0002920955031655259, "loss": 2.6203, "step": 1322 }, { "epoch": 6.487084870848708, "grad_norm": 1.2440689887680878, "learning_rate": 0.00029137353930317855, "loss": 2.1069, "step": 1323 }, { "epoch": 6.492004920049201, "grad_norm": 1.40066668879312, "learning_rate": 0.0002906521017203515, "loss": 2.5173, "step": 1324 }, { "epoch": 6.496924969249693, "grad_norm": 1.3803328968474082, "learning_rate": 0.0002899311922369375, "loss": 2.2217, "step": 1325 }, { "epoch": 6.501845018450185, "grad_norm": 1.3019635788672341, "learning_rate": 0.0002892108126714971, "loss": 2.6106, "step": 1326 }, { "epoch": 6.506765067650677, "grad_norm": 0.6868307843820086, "learning_rate": 0.0002884909648412545, "loss": 2.85, "step": 1327 }, { "epoch": 6.511685116851169, "grad_norm": 1.658410110446663, "learning_rate": 0.0002877716505620925, "loss": 2.4416, "step": 1328 }, { "epoch": 6.516605166051661, "grad_norm": 1.017664100567637, "learning_rate": 0.00028705287164854755, "loss": 2.4813, "step": 1329 }, { "epoch": 6.521525215252153, "grad_norm": 0.9153814606658844, "learning_rate": 0.00028633462991380613, "loss": 2.8199, "step": 1330 }, { "epoch": 6.5264452644526445, "grad_norm": 1.0229824424605538, "learning_rate": 0.0002856169271696991, "loss": 2.3999, "step": 1331 }, { "epoch": 6.531365313653136, "grad_norm": 0.8365520148071467, "learning_rate": 0.0002848997652266981, "loss": 2.4198, "step": 1332 }, { "epoch": 6.536285362853628, "grad_norm": 1.5300162157350063, "learning_rate": 0.00028418314589391026, "loss": 2.7259, "step": 1333 }, { "epoch": 6.54120541205412, "grad_norm": 1.2125663803364484, "learning_rate": 0.0002834670709790741, "loss": 2.309, "step": 1334 }, { "epoch": 6.546125461254612, "grad_norm": 1.8688208176876173, "learning_rate": 0.0002827515422885549, "loss": 2.4824, "step": 1335 }, { "epoch": 6.551045510455104, "grad_norm": 1.476122352778993, "learning_rate": 0.0002820365616273396, "loss": 2.8408, "step": 1336 }, { "epoch": 6.555965559655597, "grad_norm": 1.4669779253872348, "learning_rate": 0.00028132213079903336, "loss": 2.5853, "step": 1337 }, { "epoch": 6.560885608856088, "grad_norm": 3.4805980458749053, "learning_rate": 0.0002806082516058536, "loss": 2.7307, "step": 1338 }, { "epoch": 6.565805658056581, "grad_norm": 0.8672147934042996, "learning_rate": 0.0002798949258486263, "loss": 2.6764, "step": 1339 }, { "epoch": 6.570725707257073, "grad_norm": 3.511597102328714, "learning_rate": 0.0002791821553267823, "loss": 2.7551, "step": 1340 }, { "epoch": 6.575645756457565, "grad_norm": 0.7901361836530609, "learning_rate": 0.00027846994183835074, "loss": 2.5452, "step": 1341 }, { "epoch": 6.580565805658057, "grad_norm": 3.35633355044932, "learning_rate": 0.00027775828717995567, "loss": 2.6385, "step": 1342 }, { "epoch": 6.585485854858549, "grad_norm": 1.1852813692515198, "learning_rate": 0.00027704719314681207, "loss": 2.8857, "step": 1343 }, { "epoch": 6.590405904059041, "grad_norm": 3.3181735889229214, "learning_rate": 0.00027633666153272006, "loss": 2.4976, "step": 1344 }, { "epoch": 6.5953259532595325, "grad_norm": 1.801140031012645, "learning_rate": 0.0002756266941300615, "loss": 2.6768, "step": 1345 }, { "epoch": 6.6002460024600245, "grad_norm": 1.7153828076790059, "learning_rate": 0.00027491729272979473, "loss": 2.6484, "step": 1346 }, { "epoch": 6.605166051660516, "grad_norm": 3.203173808767358, "learning_rate": 0.0002742084591214501, "loss": 2.698, "step": 1347 }, { "epoch": 6.610086100861008, "grad_norm": 1.6550213266036071, "learning_rate": 0.0002735001950931262, "loss": 2.7336, "step": 1348 }, { "epoch": 6.6150061500615, "grad_norm": 1.612793781148384, "learning_rate": 0.0002727925024314841, "loss": 2.7516, "step": 1349 }, { "epoch": 6.619926199261993, "grad_norm": 2.0759699237758817, "learning_rate": 0.0002720853829217443, "loss": 2.486, "step": 1350 }, { "epoch": 6.624846248462484, "grad_norm": 1.9550841559669652, "learning_rate": 0.00027137883834768073, "loss": 2.7317, "step": 1351 }, { "epoch": 6.629766297662977, "grad_norm": 1.0917103183831438, "learning_rate": 0.0002706728704916175, "loss": 2.6653, "step": 1352 }, { "epoch": 6.634686346863469, "grad_norm": 1.7040761799959754, "learning_rate": 0.00026996748113442394, "loss": 2.2147, "step": 1353 }, { "epoch": 6.639606396063961, "grad_norm": 2.520356554382261, "learning_rate": 0.0002692626720555097, "loss": 2.6831, "step": 1354 }, { "epoch": 6.644526445264453, "grad_norm": 2.2378803735606407, "learning_rate": 0.00026855844503282056, "loss": 2.8208, "step": 1355 }, { "epoch": 6.649446494464945, "grad_norm": 2.7334039747234393, "learning_rate": 0.00026785480184283474, "loss": 2.7701, "step": 1356 }, { "epoch": 6.654366543665437, "grad_norm": 7.0898233117669704, "learning_rate": 0.0002671517442605566, "loss": 2.3932, "step": 1357 }, { "epoch": 6.659286592865929, "grad_norm": 1.3055342370064242, "learning_rate": 0.00026644927405951427, "loss": 2.2486, "step": 1358 }, { "epoch": 6.6642066420664205, "grad_norm": 0.6577253197406228, "learning_rate": 0.00026574739301175366, "loss": 2.5234, "step": 1359 }, { "epoch": 6.6691266912669125, "grad_norm": 1.483405050378105, "learning_rate": 0.0002650461028878344, "loss": 2.8054, "step": 1360 }, { "epoch": 6.674046740467404, "grad_norm": 2.2886112193497476, "learning_rate": 0.00026434540545682586, "loss": 2.9222, "step": 1361 }, { "epoch": 6.678966789667896, "grad_norm": 2.3362743584674397, "learning_rate": 0.0002636453024863017, "loss": 2.2556, "step": 1362 }, { "epoch": 6.683886838868388, "grad_norm": 2.116236623785489, "learning_rate": 0.0002629457957423365, "loss": 2.3975, "step": 1363 }, { "epoch": 6.68880688806888, "grad_norm": 1.6267130080004553, "learning_rate": 0.00026224688698950094, "loss": 2.5446, "step": 1364 }, { "epoch": 6.693726937269373, "grad_norm": 1.075089592119318, "learning_rate": 0.0002615485779908564, "loss": 2.3649, "step": 1365 }, { "epoch": 6.698646986469865, "grad_norm": 0.8977945591488151, "learning_rate": 0.0002608508705079523, "loss": 2.3904, "step": 1366 }, { "epoch": 6.703567035670357, "grad_norm": 1.24171889036448, "learning_rate": 0.00026015376630082, "loss": 2.2857, "step": 1367 }, { "epoch": 6.708487084870849, "grad_norm": 1.6228422185531852, "learning_rate": 0.0002594572671279691, "loss": 2.6924, "step": 1368 }, { "epoch": 6.713407134071341, "grad_norm": 1.0081054485286745, "learning_rate": 0.0002587613747463832, "loss": 2.5881, "step": 1369 }, { "epoch": 6.718327183271833, "grad_norm": 0.8754437752556458, "learning_rate": 0.00025806609091151545, "loss": 2.0792, "step": 1370 }, { "epoch": 6.723247232472325, "grad_norm": 1.2383497761713083, "learning_rate": 0.00025737141737728307, "loss": 2.7867, "step": 1371 }, { "epoch": 6.728167281672817, "grad_norm": 1.5334597724659726, "learning_rate": 0.00025667735589606465, "loss": 2.2718, "step": 1372 }, { "epoch": 6.733087330873309, "grad_norm": 0.9177989662392823, "learning_rate": 0.000255983908218694, "loss": 2.5065, "step": 1373 }, { "epoch": 6.7380073800738005, "grad_norm": 1.6501544768559384, "learning_rate": 0.00025529107609445733, "loss": 2.7184, "step": 1374 }, { "epoch": 6.7429274292742925, "grad_norm": 1.0342042338796615, "learning_rate": 0.00025459886127108736, "loss": 2.2444, "step": 1375 }, { "epoch": 6.747847478474784, "grad_norm": 0.9067834277535822, "learning_rate": 0.00025390726549476, "loss": 2.7299, "step": 1376 }, { "epoch": 6.752767527675276, "grad_norm": 1.3036080595962558, "learning_rate": 0.0002532162905100898, "loss": 1.9737, "step": 1377 }, { "epoch": 6.757687576875769, "grad_norm": 2.2311658509785293, "learning_rate": 0.00025252593806012445, "loss": 1.9152, "step": 1378 }, { "epoch": 6.762607626076261, "grad_norm": 1.2190867364918592, "learning_rate": 0.00025183620988634227, "loss": 2.8661, "step": 1379 }, { "epoch": 6.767527675276753, "grad_norm": 0.8177097850709304, "learning_rate": 0.0002511471077286461, "loss": 2.6576, "step": 1380 }, { "epoch": 6.772447724477245, "grad_norm": 0.8345134264827838, "learning_rate": 0.00025045863332535945, "loss": 2.7083, "step": 1381 }, { "epoch": 6.777367773677737, "grad_norm": 0.8265293645228095, "learning_rate": 0.0002497707884132233, "loss": 2.4134, "step": 1382 }, { "epoch": 6.782287822878229, "grad_norm": 6.819426229377848, "learning_rate": 0.0002490835747273896, "loss": 2.907, "step": 1383 }, { "epoch": 6.787207872078721, "grad_norm": 0.6963140578142731, "learning_rate": 0.0002483969940014182, "loss": 2.7242, "step": 1384 }, { "epoch": 6.792127921279213, "grad_norm": 1.1466482456344442, "learning_rate": 0.00024771104796727275, "loss": 2.3707, "step": 1385 }, { "epoch": 6.797047970479705, "grad_norm": 1.4563649118496231, "learning_rate": 0.0002470257383553151, "loss": 2.6461, "step": 1386 }, { "epoch": 6.801968019680197, "grad_norm": 1.0879411610368663, "learning_rate": 0.00024634106689430234, "loss": 2.5324, "step": 1387 }, { "epoch": 6.8068880688806885, "grad_norm": 1.3724587100929702, "learning_rate": 0.0002456570353113816, "loss": 2.4805, "step": 1388 }, { "epoch": 6.8118081180811805, "grad_norm": 1.041413998497983, "learning_rate": 0.0002449736453320854, "loss": 2.2664, "step": 1389 }, { "epoch": 6.816728167281672, "grad_norm": 0.931808815437349, "learning_rate": 0.0002442908986803284, "loss": 2.1601, "step": 1390 }, { "epoch": 6.821648216482165, "grad_norm": 1.1484660156421374, "learning_rate": 0.00024360879707840177, "loss": 2.3484, "step": 1391 }, { "epoch": 6.826568265682657, "grad_norm": 0.8589585938187826, "learning_rate": 0.00024292734224697023, "loss": 2.2543, "step": 1392 }, { "epoch": 6.831488314883149, "grad_norm": 1.2485843925053202, "learning_rate": 0.00024224653590506606, "loss": 2.4372, "step": 1393 }, { "epoch": 6.836408364083641, "grad_norm": 1.6488249563937756, "learning_rate": 0.00024156637977008644, "loss": 1.85, "step": 1394 }, { "epoch": 6.841328413284133, "grad_norm": 1.299784975519758, "learning_rate": 0.00024088687555778822, "loss": 2.5343, "step": 1395 }, { "epoch": 6.846248462484625, "grad_norm": 1.0641223476944544, "learning_rate": 0.00024020802498228333, "loss": 1.9917, "step": 1396 }, { "epoch": 6.851168511685117, "grad_norm": 1.7858343798671958, "learning_rate": 0.00023952982975603492, "loss": 2.5863, "step": 1397 }, { "epoch": 6.856088560885609, "grad_norm": 2.145461618909124, "learning_rate": 0.0002388522915898534, "loss": 2.6793, "step": 1398 }, { "epoch": 6.861008610086101, "grad_norm": 1.1082721706013121, "learning_rate": 0.0002381754121928909, "loss": 2.0178, "step": 1399 }, { "epoch": 6.865928659286593, "grad_norm": 1.717888810669436, "learning_rate": 0.00023749919327263893, "loss": 2.1034, "step": 1400 }, { "epoch": 6.870848708487085, "grad_norm": 1.7377545592989154, "learning_rate": 0.00023682363653492177, "loss": 2.6252, "step": 1401 }, { "epoch": 6.875768757687577, "grad_norm": 0.986293310240004, "learning_rate": 0.00023614874368389367, "loss": 2.463, "step": 1402 }, { "epoch": 6.8806888068880685, "grad_norm": 1.3340377440029285, "learning_rate": 0.00023547451642203437, "loss": 2.5646, "step": 1403 }, { "epoch": 6.885608856088561, "grad_norm": 1.161659188410901, "learning_rate": 0.0002348009564501442, "loss": 2.1142, "step": 1404 }, { "epoch": 6.890528905289053, "grad_norm": 1.2104091843000853, "learning_rate": 0.00023412806546734057, "loss": 2.1997, "step": 1405 }, { "epoch": 6.895448954489545, "grad_norm": 1.0659659472399317, "learning_rate": 0.0002334558451710533, "loss": 2.5127, "step": 1406 }, { "epoch": 6.900369003690037, "grad_norm": 0.9396111132006758, "learning_rate": 0.00023278429725701977, "loss": 2.7071, "step": 1407 }, { "epoch": 6.905289052890529, "grad_norm": 0.7783064862075181, "learning_rate": 0.00023211342341928205, "loss": 1.9929, "step": 1408 }, { "epoch": 6.910209102091021, "grad_norm": 0.8536265924241964, "learning_rate": 0.00023144322535018126, "loss": 2.5974, "step": 1409 }, { "epoch": 6.915129151291513, "grad_norm": 1.8757414095153349, "learning_rate": 0.0002307737047403536, "loss": 2.3365, "step": 1410 }, { "epoch": 6.920049200492005, "grad_norm": 1.8929502773190103, "learning_rate": 0.00023010486327872698, "loss": 2.1208, "step": 1411 }, { "epoch": 6.924969249692497, "grad_norm": 1.435676803628705, "learning_rate": 0.0002294367026525157, "loss": 1.6119, "step": 1412 }, { "epoch": 6.929889298892989, "grad_norm": 1.2786785042745816, "learning_rate": 0.00022876922454721693, "loss": 2.1984, "step": 1413 }, { "epoch": 6.934809348093481, "grad_norm": 0.9003578915326215, "learning_rate": 0.00022810243064660558, "loss": 2.3312, "step": 1414 }, { "epoch": 6.939729397293973, "grad_norm": 2.6837451422664316, "learning_rate": 0.00022743632263273072, "loss": 2.6497, "step": 1415 }, { "epoch": 6.944649446494465, "grad_norm": 1.334271002023889, "learning_rate": 0.00022677090218591167, "loss": 2.1397, "step": 1416 }, { "epoch": 6.949569495694957, "grad_norm": 0.9729429017077861, "learning_rate": 0.00022610617098473267, "loss": 2.4084, "step": 1417 }, { "epoch": 6.9544895448954485, "grad_norm": 1.193989416488621, "learning_rate": 0.0002254421307060398, "loss": 2.6536, "step": 1418 }, { "epoch": 6.959409594095941, "grad_norm": 1.2593852644973018, "learning_rate": 0.0002247787830249361, "loss": 2.5529, "step": 1419 }, { "epoch": 6.964329643296433, "grad_norm": 1.2584303985760417, "learning_rate": 0.000224116129614777, "loss": 2.3353, "step": 1420 }, { "epoch": 6.969249692496925, "grad_norm": 0.7959391644995951, "learning_rate": 0.00022345417214716744, "loss": 2.2356, "step": 1421 }, { "epoch": 6.974169741697417, "grad_norm": 1.1587260033732396, "learning_rate": 0.00022279291229195599, "loss": 2.4997, "step": 1422 }, { "epoch": 6.979089790897909, "grad_norm": 1.504614098445438, "learning_rate": 0.00022213235171723134, "loss": 2.444, "step": 1423 }, { "epoch": 6.984009840098401, "grad_norm": 1.566828055322309, "learning_rate": 0.00022147249208931942, "loss": 2.1007, "step": 1424 }, { "epoch": 6.988929889298893, "grad_norm": 2.9467116109513793, "learning_rate": 0.0002208133350727764, "loss": 2.1695, "step": 1425 }, { "epoch": 6.993849938499385, "grad_norm": 0.8015899747948362, "learning_rate": 0.0002201548823303872, "loss": 2.3599, "step": 1426 }, { "epoch": 6.998769987699877, "grad_norm": 0.9027147641754285, "learning_rate": 0.00021949713552315948, "loss": 2.5369, "step": 1427 }, { "epoch": 7.0, "grad_norm": 0.9027147641754285, "learning_rate": 0.00021884009631032014, "loss": 0.5971, "step": 1428 }, { "epoch": 7.004920049200492, "grad_norm": 1.4129867467643002, "learning_rate": 0.00021818376634931153, "loss": 2.4858, "step": 1429 }, { "epoch": 7.009840098400984, "grad_norm": 2.0362035809258487, "learning_rate": 0.00021752814729578674, "loss": 2.0223, "step": 1430 }, { "epoch": 7.014760147601476, "grad_norm": 2.4596856072213957, "learning_rate": 0.00021687324080360505, "loss": 2.3125, "step": 1431 }, { "epoch": 7.019680196801968, "grad_norm": 1.25146693215718, "learning_rate": 0.00021621904852482887, "loss": 2.2659, "step": 1432 }, { "epoch": 7.02460024600246, "grad_norm": 1.541727753763104, "learning_rate": 0.00021556557210971843, "loss": 2.0997, "step": 1433 }, { "epoch": 7.029520295202952, "grad_norm": 1.888758420941707, "learning_rate": 0.00021491281320672867, "loss": 2.4571, "step": 1434 }, { "epoch": 7.0344403444034445, "grad_norm": 1.3748291035829516, "learning_rate": 0.00021426077346250384, "loss": 1.84, "step": 1435 }, { "epoch": 7.039360393603936, "grad_norm": 0.9888412965437223, "learning_rate": 0.00021360945452187459, "loss": 2.2602, "step": 1436 }, { "epoch": 7.044280442804428, "grad_norm": 1.4980057303100422, "learning_rate": 0.00021295885802785332, "loss": 2.1745, "step": 1437 }, { "epoch": 7.04920049200492, "grad_norm": 1.2574417695668458, "learning_rate": 0.00021230898562162942, "loss": 2.1035, "step": 1438 }, { "epoch": 7.054120541205412, "grad_norm": 0.9366081444947427, "learning_rate": 0.00021165983894256646, "loss": 2.1814, "step": 1439 }, { "epoch": 7.059040590405904, "grad_norm": 1.0318904165346279, "learning_rate": 0.00021101141962819676, "loss": 2.4839, "step": 1440 }, { "epoch": 7.063960639606396, "grad_norm": 1.0363482569990952, "learning_rate": 0.00021036372931421748, "loss": 2.6185, "step": 1441 }, { "epoch": 7.068880688806888, "grad_norm": 1.1725191809157007, "learning_rate": 0.00020971676963448804, "loss": 2.3972, "step": 1442 }, { "epoch": 7.07380073800738, "grad_norm": 1.0828032110486845, "learning_rate": 0.00020907054222102367, "loss": 2.5699, "step": 1443 }, { "epoch": 7.078720787207872, "grad_norm": 1.897130189419103, "learning_rate": 0.00020842504870399248, "loss": 2.4239, "step": 1444 }, { "epoch": 7.083640836408364, "grad_norm": 1.2646203583221718, "learning_rate": 0.00020778029071171188, "loss": 2.4873, "step": 1445 }, { "epoch": 7.088560885608856, "grad_norm": 0.6379728829703689, "learning_rate": 0.00020713626987064316, "loss": 2.4547, "step": 1446 }, { "epoch": 7.093480934809348, "grad_norm": 2.3024920761251186, "learning_rate": 0.0002064929878053885, "loss": 2.4828, "step": 1447 }, { "epoch": 7.0984009840098405, "grad_norm": 1.7318595926902343, "learning_rate": 0.00020585044613868658, "loss": 2.1544, "step": 1448 }, { "epoch": 7.1033210332103325, "grad_norm": 1.1149158172316123, "learning_rate": 0.00020520864649140763, "loss": 2.1407, "step": 1449 }, { "epoch": 7.108241082410824, "grad_norm": 1.108424871019143, "learning_rate": 0.00020456759048255107, "loss": 2.2728, "step": 1450 }, { "epoch": 7.113161131611316, "grad_norm": 1.046986720166366, "learning_rate": 0.0002039272797292394, "loss": 1.9311, "step": 1451 }, { "epoch": 7.118081180811808, "grad_norm": 1.321306492919154, "learning_rate": 0.00020328771584671613, "loss": 1.8751, "step": 1452 }, { "epoch": 7.1230012300123, "grad_norm": 1.2478067467430103, "learning_rate": 0.00020264890044833996, "loss": 1.8809, "step": 1453 }, { "epoch": 7.127921279212792, "grad_norm": 2.1493838360525293, "learning_rate": 0.00020201083514558143, "loss": 2.0908, "step": 1454 }, { "epoch": 7.132841328413284, "grad_norm": 2.1883677016137613, "learning_rate": 0.0002013735215480199, "loss": 2.2702, "step": 1455 }, { "epoch": 7.137761377613776, "grad_norm": 1.4979690956427818, "learning_rate": 0.0002007369612633375, "loss": 1.9488, "step": 1456 }, { "epoch": 7.142681426814268, "grad_norm": 1.482529843474224, "learning_rate": 0.00020010115589731613, "loss": 2.1134, "step": 1457 }, { "epoch": 7.14760147601476, "grad_norm": 1.4099356588831977, "learning_rate": 0.00019946610705383396, "loss": 1.9134, "step": 1458 }, { "epoch": 7.152521525215252, "grad_norm": 1.174731768696012, "learning_rate": 0.0001988318163348599, "loss": 2.414, "step": 1459 }, { "epoch": 7.157441574415744, "grad_norm": 1.175093687988007, "learning_rate": 0.0001981982853404512, "loss": 2.1186, "step": 1460 }, { "epoch": 7.162361623616236, "grad_norm": 1.9149877410347815, "learning_rate": 0.00019756551566874837, "loss": 2.2254, "step": 1461 }, { "epoch": 7.167281672816729, "grad_norm": 1.0310934965868699, "learning_rate": 0.00019693350891597113, "loss": 2.3453, "step": 1462 }, { "epoch": 7.1722017220172205, "grad_norm": 2.5780072806730416, "learning_rate": 0.00019630226667641514, "loss": 2.1589, "step": 1463 }, { "epoch": 7.177121771217712, "grad_norm": 1.7893001308779062, "learning_rate": 0.00019567179054244698, "loss": 2.2807, "step": 1464 }, { "epoch": 7.182041820418204, "grad_norm": 0.8466212487627339, "learning_rate": 0.00019504208210450124, "loss": 2.3012, "step": 1465 }, { "epoch": 7.186961869618696, "grad_norm": 1.1305103196043627, "learning_rate": 0.00019441314295107537, "loss": 2.0161, "step": 1466 }, { "epoch": 7.191881918819188, "grad_norm": 1.3426704794026334, "learning_rate": 0.00019378497466872657, "loss": 2.1754, "step": 1467 }, { "epoch": 7.19680196801968, "grad_norm": 1.6227384769709832, "learning_rate": 0.00019315757884206757, "loss": 1.7929, "step": 1468 }, { "epoch": 7.201722017220172, "grad_norm": 1.2161326345324452, "learning_rate": 0.00019253095705376217, "loss": 2.273, "step": 1469 }, { "epoch": 7.206642066420664, "grad_norm": 1.0819112784315619, "learning_rate": 0.00019190511088452145, "loss": 2.2744, "step": 1470 }, { "epoch": 7.211562115621156, "grad_norm": 2.126610981871707, "learning_rate": 0.00019128004191310062, "loss": 2.2675, "step": 1471 }, { "epoch": 7.216482164821648, "grad_norm": 1.1104171468645891, "learning_rate": 0.0001906557517162935, "loss": 1.9552, "step": 1472 }, { "epoch": 7.22140221402214, "grad_norm": 0.8270119107445274, "learning_rate": 0.00019003224186892997, "loss": 2.1249, "step": 1473 }, { "epoch": 7.226322263222632, "grad_norm": 1.7657759117430984, "learning_rate": 0.00018940951394387146, "loss": 2.115, "step": 1474 }, { "epoch": 7.231242312423125, "grad_norm": 1.0711513510401762, "learning_rate": 0.00018878756951200626, "loss": 1.5788, "step": 1475 }, { "epoch": 7.236162361623617, "grad_norm": 0.8436540893120502, "learning_rate": 0.00018816641014224702, "loss": 2.5179, "step": 1476 }, { "epoch": 7.2410824108241085, "grad_norm": 0.9791233022412463, "learning_rate": 0.00018754603740152531, "loss": 1.9702, "step": 1477 }, { "epoch": 7.2460024600246005, "grad_norm": 1.3340098042290514, "learning_rate": 0.00018692645285478882, "loss": 2.3675, "step": 1478 }, { "epoch": 7.250922509225092, "grad_norm": 1.6198670290132193, "learning_rate": 0.000186307658064997, "loss": 2.4627, "step": 1479 }, { "epoch": 7.255842558425584, "grad_norm": 1.1253240124117427, "learning_rate": 0.00018568965459311653, "loss": 2.5437, "step": 1480 }, { "epoch": 7.260762607626076, "grad_norm": 2.306175921414552, "learning_rate": 0.00018507244399811856, "loss": 2.1191, "step": 1481 }, { "epoch": 7.265682656826568, "grad_norm": 1.1337774148418218, "learning_rate": 0.00018445602783697374, "loss": 1.9716, "step": 1482 }, { "epoch": 7.27060270602706, "grad_norm": 1.0147885601929878, "learning_rate": 0.00018384040766464854, "loss": 2.285, "step": 1483 }, { "epoch": 7.275522755227552, "grad_norm": 1.4848470497353499, "learning_rate": 0.00018322558503410197, "loss": 1.9407, "step": 1484 }, { "epoch": 7.280442804428044, "grad_norm": 1.2208682456180604, "learning_rate": 0.000182611561496281, "loss": 1.6875, "step": 1485 }, { "epoch": 7.285362853628536, "grad_norm": 2.6168084393550797, "learning_rate": 0.0001819983386001164, "loss": 2.3791, "step": 1486 }, { "epoch": 7.290282902829028, "grad_norm": 1.3746981920313504, "learning_rate": 0.00018138591789251997, "loss": 2.4157, "step": 1487 }, { "epoch": 7.29520295202952, "grad_norm": 1.2448883421511918, "learning_rate": 0.0001807743009183791, "loss": 2.2129, "step": 1488 }, { "epoch": 7.300123001230013, "grad_norm": 0.9909681641946865, "learning_rate": 0.00018016348922055448, "loss": 2.3471, "step": 1489 }, { "epoch": 7.305043050430505, "grad_norm": 0.9002401210929803, "learning_rate": 0.00017955348433987473, "loss": 1.9602, "step": 1490 }, { "epoch": 7.3099630996309966, "grad_norm": 1.2585528536992074, "learning_rate": 0.00017894428781513368, "loss": 2.0168, "step": 1491 }, { "epoch": 7.3148831488314885, "grad_norm": 1.4623201356760576, "learning_rate": 0.000178335901183086, "loss": 2.2558, "step": 1492 }, { "epoch": 7.31980319803198, "grad_norm": 1.295511477419598, "learning_rate": 0.00017772832597844285, "loss": 2.0787, "step": 1493 }, { "epoch": 7.324723247232472, "grad_norm": 2.1136126320600312, "learning_rate": 0.00017712156373386913, "loss": 2.4155, "step": 1494 }, { "epoch": 7.329643296432964, "grad_norm": 1.0259102158932762, "learning_rate": 0.00017651561597997845, "loss": 2.2933, "step": 1495 }, { "epoch": 7.334563345633456, "grad_norm": 1.1849983372460589, "learning_rate": 0.00017591048424532975, "loss": 2.0625, "step": 1496 }, { "epoch": 7.339483394833948, "grad_norm": 1.739017894378105, "learning_rate": 0.00017530617005642428, "loss": 2.1194, "step": 1497 }, { "epoch": 7.34440344403444, "grad_norm": 0.8637599965719654, "learning_rate": 0.0001747026749377002, "loss": 2.4137, "step": 1498 }, { "epoch": 7.349323493234932, "grad_norm": 2.057682369812147, "learning_rate": 0.0001741000004115295, "loss": 2.2591, "step": 1499 }, { "epoch": 7.354243542435424, "grad_norm": 1.2213830469337867, "learning_rate": 0.0001734981479982146, "loss": 2.1298, "step": 1500 }, { "epoch": 7.359163591635916, "grad_norm": 1.6889001076899095, "learning_rate": 0.0001728971192159836, "loss": 1.939, "step": 1501 }, { "epoch": 7.364083640836409, "grad_norm": 0.8654962313406968, "learning_rate": 0.0001722969155809872, "loss": 1.9681, "step": 1502 }, { "epoch": 7.369003690036901, "grad_norm": 1.2324421198837354, "learning_rate": 0.0001716975386072947, "loss": 2.4584, "step": 1503 }, { "epoch": 7.373923739237393, "grad_norm": 2.1731441946252383, "learning_rate": 0.0001710989898068896, "loss": 2.145, "step": 1504 }, { "epoch": 7.378843788437885, "grad_norm": 0.6216859678295367, "learning_rate": 0.00017050127068966682, "loss": 2.0521, "step": 1505 }, { "epoch": 7.3837638376383765, "grad_norm": 0.7394465837324259, "learning_rate": 0.0001699043827634278, "loss": 2.0315, "step": 1506 }, { "epoch": 7.3886838868388685, "grad_norm": 0.8553620177685016, "learning_rate": 0.00016930832753387764, "loss": 1.9948, "step": 1507 }, { "epoch": 7.39360393603936, "grad_norm": 1.0946418198069783, "learning_rate": 0.00016871310650462058, "loss": 1.5568, "step": 1508 }, { "epoch": 7.398523985239852, "grad_norm": 1.1285762191011401, "learning_rate": 0.0001681187211771567, "loss": 2.1901, "step": 1509 }, { "epoch": 7.403444034440344, "grad_norm": 1.0038509818975356, "learning_rate": 0.00016752517305087812, "loss": 1.9365, "step": 1510 }, { "epoch": 7.408364083640836, "grad_norm": 0.9410490426594265, "learning_rate": 0.00016693246362306464, "loss": 1.8867, "step": 1511 }, { "epoch": 7.413284132841328, "grad_norm": 0.9731932645919288, "learning_rate": 0.00016634059438888033, "loss": 2.3505, "step": 1512 }, { "epoch": 7.41820418204182, "grad_norm": 2.0497796999434073, "learning_rate": 0.00016574956684137045, "loss": 1.9589, "step": 1513 }, { "epoch": 7.423124231242312, "grad_norm": 1.2059652756117036, "learning_rate": 0.00016515938247145613, "loss": 1.9331, "step": 1514 }, { "epoch": 7.428044280442805, "grad_norm": 0.9620261662100495, "learning_rate": 0.00016457004276793224, "loss": 2.1446, "step": 1515 }, { "epoch": 7.432964329643297, "grad_norm": 1.8668134265464367, "learning_rate": 0.00016398154921746272, "loss": 2.0908, "step": 1516 }, { "epoch": 7.437884378843789, "grad_norm": 0.7240057291935856, "learning_rate": 0.0001633939033045766, "loss": 2.005, "step": 1517 }, { "epoch": 7.442804428044281, "grad_norm": 1.0609162045456355, "learning_rate": 0.00016280710651166535, "loss": 1.7834, "step": 1518 }, { "epoch": 7.447724477244773, "grad_norm": 1.0048124524578408, "learning_rate": 0.0001622211603189777, "loss": 2.3108, "step": 1519 }, { "epoch": 7.4526445264452645, "grad_norm": 1.7501890048017468, "learning_rate": 0.0001616360662046173, "loss": 2.1364, "step": 1520 }, { "epoch": 7.4575645756457565, "grad_norm": 1.4377759139009383, "learning_rate": 0.0001610518256445382, "loss": 2.1895, "step": 1521 }, { "epoch": 7.462484624846248, "grad_norm": 2.262957402789069, "learning_rate": 0.00016046844011254085, "loss": 2.1514, "step": 1522 }, { "epoch": 7.46740467404674, "grad_norm": 0.8584274672026593, "learning_rate": 0.00015988591108026951, "loss": 2.0271, "step": 1523 }, { "epoch": 7.472324723247232, "grad_norm": 0.9224830080093815, "learning_rate": 0.00015930424001720729, "loss": 2.5238, "step": 1524 }, { "epoch": 7.477244772447724, "grad_norm": 1.2185664978169204, "learning_rate": 0.00015872342839067305, "loss": 1.6208, "step": 1525 }, { "epoch": 7.482164821648216, "grad_norm": 0.4278485323092009, "learning_rate": 0.0001581434776658179, "loss": 1.8562, "step": 1526 }, { "epoch": 7.487084870848708, "grad_norm": 0.7787287181038978, "learning_rate": 0.00015756438930562128, "loss": 2.3538, "step": 1527 }, { "epoch": 7.492004920049201, "grad_norm": 1.32579572345312, "learning_rate": 0.00015698616477088679, "loss": 2.0971, "step": 1528 }, { "epoch": 7.496924969249693, "grad_norm": 1.2817747994901296, "learning_rate": 0.00015640880552023957, "loss": 2.0634, "step": 1529 }, { "epoch": 7.501845018450185, "grad_norm": 0.6753121861611743, "learning_rate": 0.0001558323130101213, "loss": 2.0088, "step": 1530 }, { "epoch": 7.506765067650677, "grad_norm": 0.8845973519208334, "learning_rate": 0.00015525668869478788, "loss": 2.2966, "step": 1531 }, { "epoch": 7.511685116851169, "grad_norm": 0.7431827715725654, "learning_rate": 0.0001546819340263046, "loss": 2.147, "step": 1532 }, { "epoch": 7.516605166051661, "grad_norm": 1.6813440131034052, "learning_rate": 0.0001541080504545433, "loss": 2.2667, "step": 1533 }, { "epoch": 7.521525215252153, "grad_norm": 0.8574377610516449, "learning_rate": 0.00015353503942717845, "loss": 1.9577, "step": 1534 }, { "epoch": 7.5264452644526445, "grad_norm": 0.8648001807202453, "learning_rate": 0.00015296290238968303, "loss": 1.9622, "step": 1535 }, { "epoch": 7.531365313653136, "grad_norm": 1.1077403856879942, "learning_rate": 0.0001523916407853259, "loss": 1.449, "step": 1536 }, { "epoch": 7.536285362853628, "grad_norm": 1.766368053806959, "learning_rate": 0.00015182125605516707, "loss": 1.9798, "step": 1537 }, { "epoch": 7.54120541205412, "grad_norm": 1.135382537410408, "learning_rate": 0.00015125174963805427, "loss": 1.8061, "step": 1538 }, { "epoch": 7.546125461254612, "grad_norm": 0.8672525756053826, "learning_rate": 0.00015068312297062086, "loss": 2.1561, "step": 1539 }, { "epoch": 7.551045510455104, "grad_norm": 1.0103147029652069, "learning_rate": 0.00015011537748727972, "loss": 2.105, "step": 1540 }, { "epoch": 7.555965559655597, "grad_norm": 1.6170715246350007, "learning_rate": 0.00014954851462022116, "loss": 2.3411, "step": 1541 }, { "epoch": 7.560885608856088, "grad_norm": 0.9213969627853075, "learning_rate": 0.00014898253579940946, "loss": 2.1108, "step": 1542 }, { "epoch": 7.565805658056581, "grad_norm": 1.3461096200755767, "learning_rate": 0.0001484174424525781, "loss": 1.9683, "step": 1543 }, { "epoch": 7.570725707257073, "grad_norm": 1.3648393750946384, "learning_rate": 0.00014785323600522744, "loss": 1.9156, "step": 1544 }, { "epoch": 7.575645756457565, "grad_norm": 0.9615283793826174, "learning_rate": 0.00014728991788062051, "loss": 2.2178, "step": 1545 }, { "epoch": 7.580565805658057, "grad_norm": 1.460657149633074, "learning_rate": 0.00014672748949977904, "loss": 2.0856, "step": 1546 }, { "epoch": 7.585485854858549, "grad_norm": 1.5566026577828502, "learning_rate": 0.00014616595228148093, "loss": 2.3678, "step": 1547 }, { "epoch": 7.590405904059041, "grad_norm": 1.4668991383253995, "learning_rate": 0.00014560530764225527, "loss": 2.0078, "step": 1548 }, { "epoch": 7.5953259532595325, "grad_norm": 1.5392447437231267, "learning_rate": 0.00014504555699638034, "loss": 2.0965, "step": 1549 }, { "epoch": 7.6002460024600245, "grad_norm": 1.3992100511624816, "learning_rate": 0.00014448670175587846, "loss": 2.2653, "step": 1550 }, { "epoch": 7.605166051660516, "grad_norm": 1.0131167263862717, "learning_rate": 0.00014392874333051388, "loss": 2.1786, "step": 1551 }, { "epoch": 7.610086100861008, "grad_norm": 1.0127998442341626, "learning_rate": 0.0001433716831277883, "loss": 1.6635, "step": 1552 }, { "epoch": 7.6150061500615, "grad_norm": 1.648192019176012, "learning_rate": 0.00014281552255293738, "loss": 2.3213, "step": 1553 }, { "epoch": 7.619926199261993, "grad_norm": 1.2966770892390989, "learning_rate": 0.00014226026300892774, "loss": 1.8294, "step": 1554 }, { "epoch": 7.624846248462484, "grad_norm": 2.0010937684623005, "learning_rate": 0.00014170590589645271, "loss": 1.7362, "step": 1555 }, { "epoch": 7.629766297662977, "grad_norm": 0.8715937645791298, "learning_rate": 0.0001411524526139291, "loss": 2.6154, "step": 1556 }, { "epoch": 7.634686346863469, "grad_norm": 2.9688612954620535, "learning_rate": 0.0001405999045574945, "loss": 2.0415, "step": 1557 }, { "epoch": 7.639606396063961, "grad_norm": 2.1620416771131437, "learning_rate": 0.00014004826312100216, "loss": 2.1113, "step": 1558 }, { "epoch": 7.644526445264453, "grad_norm": 1.3618201419591998, "learning_rate": 0.00013949752969601837, "loss": 1.9737, "step": 1559 }, { "epoch": 7.649446494464945, "grad_norm": 1.0218155059110938, "learning_rate": 0.00013894770567181937, "loss": 1.9361, "step": 1560 }, { "epoch": 7.654366543665437, "grad_norm": 1.0642360040866508, "learning_rate": 0.00013839879243538679, "loss": 2.3231, "step": 1561 }, { "epoch": 7.659286592865929, "grad_norm": 0.8170854189716349, "learning_rate": 0.0001378507913714051, "loss": 1.6561, "step": 1562 }, { "epoch": 7.6642066420664205, "grad_norm": 0.8655889717521326, "learning_rate": 0.00013730370386225775, "loss": 2.0147, "step": 1563 }, { "epoch": 7.6691266912669125, "grad_norm": 1.161950386588091, "learning_rate": 0.00013675753128802325, "loss": 2.1919, "step": 1564 }, { "epoch": 7.674046740467404, "grad_norm": 3.033221844772658, "learning_rate": 0.0001362122750264727, "loss": 1.9284, "step": 1565 }, { "epoch": 7.678966789667896, "grad_norm": 0.809821084658163, "learning_rate": 0.000135667936453065, "loss": 2.1587, "step": 1566 }, { "epoch": 7.683886838868388, "grad_norm": 2.209310891226035, "learning_rate": 0.00013512451694094491, "loss": 2.1578, "step": 1567 }, { "epoch": 7.68880688806888, "grad_norm": 1.4223882481081784, "learning_rate": 0.00013458201786093794, "loss": 1.8868, "step": 1568 }, { "epoch": 7.693726937269373, "grad_norm": 0.9921086780783321, "learning_rate": 0.00013404044058154836, "loss": 2.3921, "step": 1569 }, { "epoch": 7.698646986469865, "grad_norm": 2.5240597021398394, "learning_rate": 0.000133499786468955, "loss": 2.1694, "step": 1570 }, { "epoch": 7.703567035670357, "grad_norm": 1.1996038807636755, "learning_rate": 0.00013296005688700764, "loss": 1.9453, "step": 1571 }, { "epoch": 7.708487084870849, "grad_norm": 1.1832099452280007, "learning_rate": 0.00013242125319722387, "loss": 2.0944, "step": 1572 }, { "epoch": 7.713407134071341, "grad_norm": 0.7796580797405138, "learning_rate": 0.0001318833767587861, "loss": 1.8283, "step": 1573 }, { "epoch": 7.718327183271833, "grad_norm": 0.9367650469847337, "learning_rate": 0.00013134642892853704, "loss": 2.0063, "step": 1574 }, { "epoch": 7.723247232472325, "grad_norm": 1.2582980166517985, "learning_rate": 0.0001308104110609773, "loss": 2.0157, "step": 1575 }, { "epoch": 7.728167281672817, "grad_norm": 1.0759301595570816, "learning_rate": 0.00013027532450826163, "loss": 2.2074, "step": 1576 }, { "epoch": 7.733087330873309, "grad_norm": 1.5292410588279852, "learning_rate": 0.00012974117062019503, "loss": 2.1733, "step": 1577 }, { "epoch": 7.7380073800738005, "grad_norm": 0.9214782646863559, "learning_rate": 0.0001292079507442303, "loss": 1.7848, "step": 1578 }, { "epoch": 7.7429274292742925, "grad_norm": 0.9887768093041205, "learning_rate": 0.00012867566622546357, "loss": 2.2146, "step": 1579 }, { "epoch": 7.747847478474784, "grad_norm": 0.7792492647634259, "learning_rate": 0.00012814431840663177, "loss": 1.7174, "step": 1580 }, { "epoch": 7.752767527675276, "grad_norm": 0.7966681544414332, "learning_rate": 0.00012761390862810906, "loss": 2.0397, "step": 1581 }, { "epoch": 7.757687576875769, "grad_norm": 0.7615309507140418, "learning_rate": 0.00012708443822790288, "loss": 2.4798, "step": 1582 }, { "epoch": 7.762607626076261, "grad_norm": 1.0114053035603972, "learning_rate": 0.00012655590854165145, "loss": 1.8751, "step": 1583 }, { "epoch": 7.767527675276753, "grad_norm": 0.8745802418102664, "learning_rate": 0.00012602832090261956, "loss": 1.8139, "step": 1584 }, { "epoch": 7.772447724477245, "grad_norm": 0.5697230591753268, "learning_rate": 0.00012550167664169564, "loss": 2.1325, "step": 1585 }, { "epoch": 7.777367773677737, "grad_norm": 2.31382779707015, "learning_rate": 0.00012497597708738866, "loss": 1.6714, "step": 1586 }, { "epoch": 7.782287822878229, "grad_norm": 0.8718744870995244, "learning_rate": 0.0001244512235658245, "loss": 1.8319, "step": 1587 }, { "epoch": 7.787207872078721, "grad_norm": 0.8389117150350871, "learning_rate": 0.00012392741740074198, "loss": 1.7185, "step": 1588 }, { "epoch": 7.792127921279213, "grad_norm": 1.0050232334778513, "learning_rate": 0.00012340455991349092, "loss": 1.5341, "step": 1589 }, { "epoch": 7.797047970479705, "grad_norm": 1.2193191500842275, "learning_rate": 0.00012288265242302733, "loss": 1.7552, "step": 1590 }, { "epoch": 7.801968019680197, "grad_norm": 1.6314048401300694, "learning_rate": 0.00012236169624591138, "loss": 2.0869, "step": 1591 }, { "epoch": 7.8068880688806885, "grad_norm": 2.365128692700126, "learning_rate": 0.0001218416926963028, "loss": 2.2505, "step": 1592 }, { "epoch": 7.8118081180811805, "grad_norm": 1.079945177604901, "learning_rate": 0.00012132264308595875, "loss": 1.5804, "step": 1593 }, { "epoch": 7.816728167281672, "grad_norm": 1.1561758865677154, "learning_rate": 0.00012080454872422997, "loss": 1.8045, "step": 1594 }, { "epoch": 7.821648216482165, "grad_norm": 1.570652089089571, "learning_rate": 0.00012028741091805712, "loss": 2.0055, "step": 1595 }, { "epoch": 7.826568265682657, "grad_norm": 1.0533149909059767, "learning_rate": 0.00011977123097196829, "loss": 2.1057, "step": 1596 }, { "epoch": 7.831488314883149, "grad_norm": 1.3853021883342915, "learning_rate": 0.00011925601018807496, "loss": 2.2546, "step": 1597 }, { "epoch": 7.836408364083641, "grad_norm": 1.1968965719551778, "learning_rate": 0.00011874174986606889, "loss": 2.314, "step": 1598 }, { "epoch": 7.841328413284133, "grad_norm": 1.7856596470276604, "learning_rate": 0.00011822845130321979, "loss": 1.8237, "step": 1599 }, { "epoch": 7.846248462484625, "grad_norm": 0.8792628490648093, "learning_rate": 0.00011771611579437047, "loss": 1.8044, "step": 1600 }, { "epoch": 7.851168511685117, "grad_norm": 1.2167460385124351, "learning_rate": 0.00011720474463193443, "loss": 2.0165, "step": 1601 }, { "epoch": 7.856088560885609, "grad_norm": 0.8711363930099677, "learning_rate": 0.00011669433910589289, "loss": 1.7634, "step": 1602 }, { "epoch": 7.861008610086101, "grad_norm": 1.3046873069231946, "learning_rate": 0.00011618490050379071, "loss": 1.7571, "step": 1603 }, { "epoch": 7.865928659286593, "grad_norm": 1.5425159779991002, "learning_rate": 0.00011567643011073392, "loss": 2.0673, "step": 1604 }, { "epoch": 7.870848708487085, "grad_norm": 0.9720480279138133, "learning_rate": 0.00011516892920938627, "loss": 1.8113, "step": 1605 }, { "epoch": 7.875768757687577, "grad_norm": 1.0401060755185765, "learning_rate": 0.00011466239907996534, "loss": 1.7218, "step": 1606 }, { "epoch": 7.8806888068880685, "grad_norm": 1.4843173731270105, "learning_rate": 0.00011415684100024043, "loss": 1.5715, "step": 1607 }, { "epoch": 7.885608856088561, "grad_norm": 0.6446846420511566, "learning_rate": 0.00011365225624552827, "loss": 1.7092, "step": 1608 }, { "epoch": 7.890528905289053, "grad_norm": 1.6828525108599426, "learning_rate": 0.0001131486460886908, "loss": 2.2085, "step": 1609 }, { "epoch": 7.895448954489545, "grad_norm": 1.6995427949818815, "learning_rate": 0.0001126460118001309, "loss": 1.6362, "step": 1610 }, { "epoch": 7.900369003690037, "grad_norm": 0.9557127391364807, "learning_rate": 0.00011214435464779005, "loss": 2.0055, "step": 1611 }, { "epoch": 7.905289052890529, "grad_norm": 1.789225869890806, "learning_rate": 0.00011164367589714496, "loss": 1.9007, "step": 1612 }, { "epoch": 7.910209102091021, "grad_norm": 3.4179286494484145, "learning_rate": 0.00011114397681120387, "loss": 1.8195, "step": 1613 }, { "epoch": 7.915129151291513, "grad_norm": 1.1607809919109142, "learning_rate": 0.00011064525865050379, "loss": 1.729, "step": 1614 }, { "epoch": 7.920049200492005, "grad_norm": 1.5917885607303222, "learning_rate": 0.00011014752267310757, "loss": 1.8057, "step": 1615 }, { "epoch": 7.924969249692497, "grad_norm": 1.4913505081526146, "learning_rate": 0.00010965077013459995, "loss": 1.6642, "step": 1616 }, { "epoch": 7.929889298892989, "grad_norm": 0.6293753982953719, "learning_rate": 0.00010915500228808523, "loss": 1.6374, "step": 1617 }, { "epoch": 7.934809348093481, "grad_norm": 1.6993512455632334, "learning_rate": 0.00010866022038418377, "loss": 1.8317, "step": 1618 }, { "epoch": 7.939729397293973, "grad_norm": 1.5410628320998783, "learning_rate": 0.0001081664256710283, "loss": 1.5836, "step": 1619 }, { "epoch": 7.944649446494465, "grad_norm": 1.3528559655866592, "learning_rate": 0.00010767361939426195, "loss": 2.3677, "step": 1620 }, { "epoch": 7.949569495694957, "grad_norm": 2.4019918953691595, "learning_rate": 0.00010718180279703371, "loss": 1.6864, "step": 1621 }, { "epoch": 7.9544895448954485, "grad_norm": 1.53815620989729, "learning_rate": 0.00010669097711999659, "loss": 2.2325, "step": 1622 }, { "epoch": 7.959409594095941, "grad_norm": 0.943158282791787, "learning_rate": 0.00010620114360130384, "loss": 1.7194, "step": 1623 }, { "epoch": 7.964329643296433, "grad_norm": 1.0967559273561973, "learning_rate": 0.00010571230347660542, "loss": 1.6436, "step": 1624 }, { "epoch": 7.969249692496925, "grad_norm": 1.0096982641595444, "learning_rate": 0.00010522445797904607, "loss": 2.0846, "step": 1625 }, { "epoch": 7.974169741697417, "grad_norm": 1.4425505101274332, "learning_rate": 0.00010473760833926088, "loss": 1.7836, "step": 1626 }, { "epoch": 7.979089790897909, "grad_norm": 1.244273829771564, "learning_rate": 0.000104251755785373, "loss": 2.092, "step": 1627 }, { "epoch": 7.984009840098401, "grad_norm": 1.8571699637829515, "learning_rate": 0.00010376690154299046, "loss": 1.6677, "step": 1628 }, { "epoch": 7.988929889298893, "grad_norm": 1.1641187040515832, "learning_rate": 0.00010328304683520307, "loss": 1.7984, "step": 1629 }, { "epoch": 7.993849938499385, "grad_norm": 1.2452739516449407, "learning_rate": 0.00010280019288257869, "loss": 2.0078, "step": 1630 }, { "epoch": 7.998769987699877, "grad_norm": 0.9195738941559858, "learning_rate": 0.00010231834090316134, "loss": 1.9211, "step": 1631 }, { "epoch": 8.0, "grad_norm": 1.6970151546700254, "learning_rate": 0.0001018374921124669, "loss": 0.5646, "step": 1632 }, { "epoch": 8.004920049200493, "grad_norm": 1.9716256097208713, "learning_rate": 0.00010135764772348105, "loss": 1.9609, "step": 1633 }, { "epoch": 8.009840098400984, "grad_norm": 1.1252258782204925, "learning_rate": 0.00010087880894665541, "loss": 2.029, "step": 1634 }, { "epoch": 8.014760147601477, "grad_norm": 1.25933545065632, "learning_rate": 0.00010040097698990508, "loss": 1.8884, "step": 1635 }, { "epoch": 8.019680196801968, "grad_norm": 0.9103152309369574, "learning_rate": 9.992415305860553e-05, "loss": 1.7325, "step": 1636 }, { "epoch": 8.02460024600246, "grad_norm": 1.168593000316198, "learning_rate": 9.944833835558887e-05, "loss": 1.9788, "step": 1637 }, { "epoch": 8.029520295202952, "grad_norm": 1.0302686632460736, "learning_rate": 9.89735340811419e-05, "loss": 1.7749, "step": 1638 }, { "epoch": 8.034440344403444, "grad_norm": 2.665969735411658, "learning_rate": 9.849974143300217e-05, "loss": 2.0107, "step": 1639 }, { "epoch": 8.039360393603936, "grad_norm": 1.3430534659233482, "learning_rate": 9.80269616063551e-05, "loss": 2.0784, "step": 1640 }, { "epoch": 8.044280442804428, "grad_norm": 1.665887890148232, "learning_rate": 9.755519579383204e-05, "loss": 1.6471, "step": 1641 }, { "epoch": 8.04920049200492, "grad_norm": 1.8239921023014831, "learning_rate": 9.708444518550552e-05, "loss": 2.0223, "step": 1642 }, { "epoch": 8.054120541205412, "grad_norm": 1.0527050934113975, "learning_rate": 9.661471096888735e-05, "loss": 2.0371, "step": 1643 }, { "epoch": 8.059040590405903, "grad_norm": 1.1263576860723037, "learning_rate": 9.614599432892574e-05, "loss": 2.0625, "step": 1644 }, { "epoch": 8.063960639606396, "grad_norm": 1.6812987724599906, "learning_rate": 9.567829644800142e-05, "loss": 1.6578, "step": 1645 }, { "epoch": 8.068880688806889, "grad_norm": 0.7357062517336761, "learning_rate": 9.52116185059258e-05, "loss": 1.8387, "step": 1646 }, { "epoch": 8.07380073800738, "grad_norm": 1.5050562688043854, "learning_rate": 9.474596167993687e-05, "loss": 1.9796, "step": 1647 }, { "epoch": 8.078720787207873, "grad_norm": 1.1909961101665127, "learning_rate": 9.428132714469701e-05, "loss": 1.6893, "step": 1648 }, { "epoch": 8.083640836408364, "grad_norm": 0.9984971864366412, "learning_rate": 9.381771607229e-05, "loss": 1.7975, "step": 1649 }, { "epoch": 8.088560885608857, "grad_norm": 1.1989343719323666, "learning_rate": 9.335512963221732e-05, "loss": 1.6891, "step": 1650 }, { "epoch": 8.093480934809348, "grad_norm": 0.7608298026415856, "learning_rate": 9.289356899139623e-05, "loss": 2.28, "step": 1651 }, { "epoch": 8.09840098400984, "grad_norm": 0.7189087708081897, "learning_rate": 9.243303531415592e-05, "loss": 1.8573, "step": 1652 }, { "epoch": 8.103321033210332, "grad_norm": 1.6498992383314666, "learning_rate": 9.197352976223494e-05, "loss": 1.6515, "step": 1653 }, { "epoch": 8.108241082410824, "grad_norm": 1.1535215506145733, "learning_rate": 9.151505349477901e-05, "loss": 1.9031, "step": 1654 }, { "epoch": 8.113161131611315, "grad_norm": 1.582582772462898, "learning_rate": 9.10576076683366e-05, "loss": 1.5619, "step": 1655 }, { "epoch": 8.118081180811808, "grad_norm": 0.7432971163442113, "learning_rate": 9.060119343685686e-05, "loss": 1.7926, "step": 1656 }, { "epoch": 8.1230012300123, "grad_norm": 0.9082181139431695, "learning_rate": 9.014581195168725e-05, "loss": 2.0248, "step": 1657 }, { "epoch": 8.127921279212792, "grad_norm": 0.9077136295635455, "learning_rate": 8.969146436156928e-05, "loss": 2.1211, "step": 1658 }, { "epoch": 8.132841328413285, "grad_norm": 1.440697552778437, "learning_rate": 8.923815181263683e-05, "loss": 1.9459, "step": 1659 }, { "epoch": 8.137761377613776, "grad_norm": 1.035538306443837, "learning_rate": 8.878587544841294e-05, "loss": 1.692, "step": 1660 }, { "epoch": 8.142681426814269, "grad_norm": 0.7669898742404221, "learning_rate": 8.83346364098061e-05, "loss": 1.7834, "step": 1661 }, { "epoch": 8.14760147601476, "grad_norm": 1.0931092445602204, "learning_rate": 8.788443583510886e-05, "loss": 1.8596, "step": 1662 }, { "epoch": 8.152521525215253, "grad_norm": 1.0418112205824783, "learning_rate": 8.743527485999341e-05, "loss": 2.0712, "step": 1663 }, { "epoch": 8.157441574415744, "grad_norm": 1.4305006561050262, "learning_rate": 8.698715461751005e-05, "loss": 2.0076, "step": 1664 }, { "epoch": 8.162361623616237, "grad_norm": 1.0410596244355548, "learning_rate": 8.654007623808336e-05, "loss": 1.8524, "step": 1665 }, { "epoch": 8.167281672816728, "grad_norm": 1.059929798513132, "learning_rate": 8.60940408495099e-05, "loss": 1.674, "step": 1666 }, { "epoch": 8.17220172201722, "grad_norm": 2.231746241921199, "learning_rate": 8.564904957695525e-05, "loss": 1.6973, "step": 1667 }, { "epoch": 8.177121771217712, "grad_norm": 1.179970632895159, "learning_rate": 8.520510354295086e-05, "loss": 1.7984, "step": 1668 }, { "epoch": 8.182041820418204, "grad_norm": 1.3637204443499145, "learning_rate": 8.476220386739153e-05, "loss": 2.2158, "step": 1669 }, { "epoch": 8.186961869618695, "grad_norm": 2.6487386915348368, "learning_rate": 8.432035166753288e-05, "loss": 2.0688, "step": 1670 }, { "epoch": 8.191881918819188, "grad_norm": 1.4454447385980342, "learning_rate": 8.387954805798748e-05, "loss": 1.9019, "step": 1671 }, { "epoch": 8.196801968019681, "grad_norm": 1.1588977128122933, "learning_rate": 8.343979415072372e-05, "loss": 2.2515, "step": 1672 }, { "epoch": 8.201722017220172, "grad_norm": 1.223244217870728, "learning_rate": 8.30010910550611e-05, "loss": 2.0215, "step": 1673 }, { "epoch": 8.206642066420665, "grad_norm": 1.9368191571646276, "learning_rate": 8.256343987766868e-05, "loss": 2.097, "step": 1674 }, { "epoch": 8.211562115621156, "grad_norm": 0.8586573566103859, "learning_rate": 8.212684172256218e-05, "loss": 1.9938, "step": 1675 }, { "epoch": 8.216482164821649, "grad_norm": 2.47140301355081, "learning_rate": 8.169129769110056e-05, "loss": 1.8849, "step": 1676 }, { "epoch": 8.22140221402214, "grad_norm": 1.5234130465239588, "learning_rate": 8.125680888198395e-05, "loss": 1.7238, "step": 1677 }, { "epoch": 8.226322263222633, "grad_norm": 1.1086133144199988, "learning_rate": 8.082337639125071e-05, "loss": 1.8811, "step": 1678 }, { "epoch": 8.231242312423124, "grad_norm": 2.1804518758752036, "learning_rate": 8.0391001312274e-05, "loss": 2.0721, "step": 1679 }, { "epoch": 8.236162361623617, "grad_norm": 2.5265595165377377, "learning_rate": 7.995968473576026e-05, "loss": 2.1367, "step": 1680 }, { "epoch": 8.241082410824108, "grad_norm": 1.0749076210129824, "learning_rate": 7.95294277497452e-05, "loss": 1.4489, "step": 1681 }, { "epoch": 8.2460024600246, "grad_norm": 1.730815982578132, "learning_rate": 7.910023143959166e-05, "loss": 2.3503, "step": 1682 }, { "epoch": 8.250922509225092, "grad_norm": 1.971502447494671, "learning_rate": 7.867209688798721e-05, "loss": 2.2064, "step": 1683 }, { "epoch": 8.255842558425584, "grad_norm": 1.0085993885058189, "learning_rate": 7.82450251749407e-05, "loss": 1.7718, "step": 1684 }, { "epoch": 8.260762607626077, "grad_norm": 2.207654587580007, "learning_rate": 7.781901737778013e-05, "loss": 2.0993, "step": 1685 }, { "epoch": 8.265682656826568, "grad_norm": 2.0876777109048033, "learning_rate": 7.739407457114949e-05, "loss": 1.7904, "step": 1686 }, { "epoch": 8.270602706027061, "grad_norm": 0.9678812256160029, "learning_rate": 7.697019782700604e-05, "loss": 2.0612, "step": 1687 }, { "epoch": 8.275522755227552, "grad_norm": 3.767165814467238, "learning_rate": 7.654738821461826e-05, "loss": 1.7792, "step": 1688 }, { "epoch": 8.280442804428045, "grad_norm": 1.156816244763411, "learning_rate": 7.612564680056234e-05, "loss": 1.8427, "step": 1689 }, { "epoch": 8.285362853628536, "grad_norm": 1.4167354566724422, "learning_rate": 7.570497464871989e-05, "loss": 2.1597, "step": 1690 }, { "epoch": 8.290282902829029, "grad_norm": 1.6245368568218608, "learning_rate": 7.528537282027558e-05, "loss": 1.8998, "step": 1691 }, { "epoch": 8.29520295202952, "grad_norm": 1.9757620917165226, "learning_rate": 7.486684237371344e-05, "loss": 1.703, "step": 1692 }, { "epoch": 8.300123001230013, "grad_norm": 1.8951925973520294, "learning_rate": 7.444938436481547e-05, "loss": 2.1799, "step": 1693 }, { "epoch": 8.305043050430504, "grad_norm": 1.1991049967799958, "learning_rate": 7.40329998466579e-05, "loss": 1.635, "step": 1694 }, { "epoch": 8.309963099630997, "grad_norm": 0.7413314042839501, "learning_rate": 7.361768986960893e-05, "loss": 1.8985, "step": 1695 }, { "epoch": 8.314883148831488, "grad_norm": 0.8667549448981532, "learning_rate": 7.320345548132679e-05, "loss": 1.9775, "step": 1696 }, { "epoch": 8.31980319803198, "grad_norm": 1.207120539590834, "learning_rate": 7.279029772675571e-05, "loss": 1.8056, "step": 1697 }, { "epoch": 8.324723247232471, "grad_norm": 1.4550328311070513, "learning_rate": 7.237821764812441e-05, "loss": 1.9976, "step": 1698 }, { "epoch": 8.329643296432964, "grad_norm": 1.0425434678513608, "learning_rate": 7.196721628494296e-05, "loss": 2.058, "step": 1699 }, { "epoch": 8.334563345633457, "grad_norm": 1.0720522092039801, "learning_rate": 7.155729467400013e-05, "loss": 1.6406, "step": 1700 }, { "epoch": 8.339483394833948, "grad_norm": 1.1974729383685745, "learning_rate": 7.114845384936108e-05, "loss": 1.6616, "step": 1701 }, { "epoch": 8.344403444034441, "grad_norm": 1.3905689617417774, "learning_rate": 7.074069484236479e-05, "loss": 1.9211, "step": 1702 }, { "epoch": 8.349323493234932, "grad_norm": 0.6638980852152053, "learning_rate": 7.03340186816207e-05, "loss": 1.7571, "step": 1703 }, { "epoch": 8.354243542435425, "grad_norm": 1.1986802714482576, "learning_rate": 6.992842639300723e-05, "loss": 1.7925, "step": 1704 }, { "epoch": 8.359163591635916, "grad_norm": 1.5224191618529657, "learning_rate": 6.952391899966825e-05, "loss": 1.6891, "step": 1705 }, { "epoch": 8.364083640836409, "grad_norm": 1.0931951368634478, "learning_rate": 6.912049752201117e-05, "loss": 1.6773, "step": 1706 }, { "epoch": 8.3690036900369, "grad_norm": 1.508176066177314, "learning_rate": 6.871816297770378e-05, "loss": 1.9068, "step": 1707 }, { "epoch": 8.373923739237393, "grad_norm": 1.770361157454554, "learning_rate": 6.83169163816722e-05, "loss": 1.8869, "step": 1708 }, { "epoch": 8.378843788437884, "grad_norm": 1.427136128616945, "learning_rate": 6.791675874609816e-05, "loss": 1.7542, "step": 1709 }, { "epoch": 8.383763837638377, "grad_norm": 1.5319572243428101, "learning_rate": 6.7517691080416e-05, "loss": 1.9544, "step": 1710 }, { "epoch": 8.388683886838868, "grad_norm": 1.2685640046655662, "learning_rate": 6.711971439131109e-05, "loss": 1.7545, "step": 1711 }, { "epoch": 8.39360393603936, "grad_norm": 0.7893040362970407, "learning_rate": 6.67228296827162e-05, "loss": 1.6908, "step": 1712 }, { "epoch": 8.398523985239853, "grad_norm": 1.6628655765560478, "learning_rate": 6.632703795580947e-05, "loss": 2.2494, "step": 1713 }, { "epoch": 8.403444034440344, "grad_norm": 1.839879649894436, "learning_rate": 6.593234020901256e-05, "loss": 2.0261, "step": 1714 }, { "epoch": 8.408364083640837, "grad_norm": 1.8514143076734184, "learning_rate": 6.553873743798677e-05, "loss": 1.7826, "step": 1715 }, { "epoch": 8.413284132841328, "grad_norm": 1.0950124311689946, "learning_rate": 6.514623063563135e-05, "loss": 1.7125, "step": 1716 }, { "epoch": 8.418204182041821, "grad_norm": 0.9256437674619962, "learning_rate": 6.475482079208111e-05, "loss": 1.9388, "step": 1717 }, { "epoch": 8.423124231242312, "grad_norm": 0.977890039249454, "learning_rate": 6.43645088947034e-05, "loss": 1.7175, "step": 1718 }, { "epoch": 8.428044280442805, "grad_norm": 1.9602584838573636, "learning_rate": 6.397529592809614e-05, "loss": 1.8957, "step": 1719 }, { "epoch": 8.432964329643296, "grad_norm": 1.1299100452304705, "learning_rate": 6.35871828740851e-05, "loss": 1.6617, "step": 1720 }, { "epoch": 8.437884378843789, "grad_norm": 1.0783259180863698, "learning_rate": 6.320017071172113e-05, "loss": 1.8818, "step": 1721 }, { "epoch": 8.44280442804428, "grad_norm": 1.481552565934371, "learning_rate": 6.281426041727828e-05, "loss": 2.1034, "step": 1722 }, { "epoch": 8.447724477244773, "grad_norm": 0.962813890743116, "learning_rate": 6.242945296425074e-05, "loss": 2.0922, "step": 1723 }, { "epoch": 8.452644526445264, "grad_norm": 1.0727020378411807, "learning_rate": 6.204574932335111e-05, "loss": 1.4819, "step": 1724 }, { "epoch": 8.457564575645756, "grad_norm": 1.5506249427045493, "learning_rate": 6.166315046250703e-05, "loss": 1.6366, "step": 1725 }, { "epoch": 8.46248462484625, "grad_norm": 0.7667051467619164, "learning_rate": 6.128165734685964e-05, "loss": 1.5776, "step": 1726 }, { "epoch": 8.46740467404674, "grad_norm": 0.9018426354534257, "learning_rate": 6.0901270938760575e-05, "loss": 1.7157, "step": 1727 }, { "epoch": 8.472324723247233, "grad_norm": 1.499143249869518, "learning_rate": 6.05219921977696e-05, "loss": 1.8012, "step": 1728 }, { "epoch": 8.477244772447724, "grad_norm": 1.3177309026687296, "learning_rate": 6.0143822080652334e-05, "loss": 1.7375, "step": 1729 }, { "epoch": 8.482164821648217, "grad_norm": 0.6758510852885528, "learning_rate": 5.9766761541378e-05, "loss": 1.6373, "step": 1730 }, { "epoch": 8.487084870848708, "grad_norm": 1.7357103840784198, "learning_rate": 5.9390811531116476e-05, "loss": 1.7053, "step": 1731 }, { "epoch": 8.492004920049201, "grad_norm": 1.0369070644890863, "learning_rate": 5.9015972998236415e-05, "loss": 1.8114, "step": 1732 }, { "epoch": 8.496924969249692, "grad_norm": 1.0018026364956503, "learning_rate": 5.864224688830283e-05, "loss": 1.7475, "step": 1733 }, { "epoch": 8.501845018450185, "grad_norm": 0.9604742934006754, "learning_rate": 5.82696341440741e-05, "loss": 1.8009, "step": 1734 }, { "epoch": 8.506765067650676, "grad_norm": 1.4714393742793, "learning_rate": 5.789813570550051e-05, "loss": 1.7107, "step": 1735 }, { "epoch": 8.511685116851169, "grad_norm": 0.9817691894986847, "learning_rate": 5.752775250972098e-05, "loss": 1.9264, "step": 1736 }, { "epoch": 8.51660516605166, "grad_norm": 1.0701439324764142, "learning_rate": 5.715848549106145e-05, "loss": 1.8103, "step": 1737 }, { "epoch": 8.521525215252153, "grad_norm": 0.6941593246848597, "learning_rate": 5.679033558103219e-05, "loss": 1.7769, "step": 1738 }, { "epoch": 8.526445264452644, "grad_norm": 1.0301783752085936, "learning_rate": 5.642330370832521e-05, "loss": 1.6676, "step": 1739 }, { "epoch": 8.531365313653136, "grad_norm": 1.5037120475683994, "learning_rate": 5.6057390798812394e-05, "loss": 1.838, "step": 1740 }, { "epoch": 8.53628536285363, "grad_norm": 1.2151777740512535, "learning_rate": 5.5692597775542866e-05, "loss": 1.7846, "step": 1741 }, { "epoch": 8.54120541205412, "grad_norm": 1.5151555054626442, "learning_rate": 5.532892555874058e-05, "loss": 1.8479, "step": 1742 }, { "epoch": 8.546125461254613, "grad_norm": 1.1542698903671704, "learning_rate": 5.496637506580243e-05, "loss": 1.7875, "step": 1743 }, { "epoch": 8.551045510455104, "grad_norm": 1.5523677980525525, "learning_rate": 5.460494721129555e-05, "loss": 1.5023, "step": 1744 }, { "epoch": 8.555965559655597, "grad_norm": 1.2920801030002422, "learning_rate": 5.4244642906954966e-05, "loss": 2.0834, "step": 1745 }, { "epoch": 8.560885608856088, "grad_norm": 0.8048974492090425, "learning_rate": 5.3885463061681796e-05, "loss": 1.5585, "step": 1746 }, { "epoch": 8.565805658056581, "grad_norm": 1.448569993825552, "learning_rate": 5.352740858154009e-05, "loss": 1.5306, "step": 1747 }, { "epoch": 8.570725707257072, "grad_norm": 1.1704845751261599, "learning_rate": 5.317048036975558e-05, "loss": 2.2945, "step": 1748 }, { "epoch": 8.575645756457565, "grad_norm": 1.4199311874570935, "learning_rate": 5.2814679326712524e-05, "loss": 1.7296, "step": 1749 }, { "epoch": 8.580565805658056, "grad_norm": 1.4001123343141912, "learning_rate": 5.246000634995196e-05, "loss": 1.9145, "step": 1750 }, { "epoch": 8.585485854858549, "grad_norm": 0.7631058494394942, "learning_rate": 5.210646233416933e-05, "loss": 1.4111, "step": 1751 }, { "epoch": 8.59040590405904, "grad_norm": 1.4356545940990248, "learning_rate": 5.175404817121188e-05, "loss": 1.6516, "step": 1752 }, { "epoch": 8.595325953259533, "grad_norm": 0.7100889729719874, "learning_rate": 5.14027647500771e-05, "loss": 1.8876, "step": 1753 }, { "epoch": 8.600246002460025, "grad_norm": 0.815180966184006, "learning_rate": 5.105261295690977e-05, "loss": 1.6949, "step": 1754 }, { "epoch": 8.605166051660516, "grad_norm": 0.896784779880597, "learning_rate": 5.070359367499994e-05, "loss": 1.5119, "step": 1755 }, { "epoch": 8.61008610086101, "grad_norm": 1.5726230553964238, "learning_rate": 5.0355707784781435e-05, "loss": 1.8786, "step": 1756 }, { "epoch": 8.6150061500615, "grad_norm": 0.8699811989017665, "learning_rate": 5.0008956163828276e-05, "loss": 1.9983, "step": 1757 }, { "epoch": 8.619926199261993, "grad_norm": 0.9612736232047608, "learning_rate": 4.966333968685338e-05, "loss": 1.5626, "step": 1758 }, { "epoch": 8.624846248462484, "grad_norm": 1.2138144561679765, "learning_rate": 4.9318859225706444e-05, "loss": 1.7847, "step": 1759 }, { "epoch": 8.629766297662977, "grad_norm": 2.267953578348376, "learning_rate": 4.8975515649371026e-05, "loss": 2.0954, "step": 1760 }, { "epoch": 8.634686346863468, "grad_norm": 1.178851092394936, "learning_rate": 4.863330982396319e-05, "loss": 1.9567, "step": 1761 }, { "epoch": 8.63960639606396, "grad_norm": 1.1774452670364215, "learning_rate": 4.82922426127288e-05, "loss": 1.6484, "step": 1762 }, { "epoch": 8.644526445264452, "grad_norm": 1.304484191959281, "learning_rate": 4.7952314876041234e-05, "loss": 1.9006, "step": 1763 }, { "epoch": 8.649446494464945, "grad_norm": 1.209015673963668, "learning_rate": 4.761352747139974e-05, "loss": 1.7884, "step": 1764 }, { "epoch": 8.654366543665436, "grad_norm": 2.1448318743998596, "learning_rate": 4.727588125342669e-05, "loss": 2.0069, "step": 1765 }, { "epoch": 8.659286592865929, "grad_norm": 1.8273607752057157, "learning_rate": 4.693937707386609e-05, "loss": 1.6698, "step": 1766 }, { "epoch": 8.664206642066421, "grad_norm": 1.162743299695991, "learning_rate": 4.660401578158052e-05, "loss": 1.4114, "step": 1767 }, { "epoch": 8.669126691266912, "grad_norm": 1.0202711560845157, "learning_rate": 4.626979822255001e-05, "loss": 1.7391, "step": 1768 }, { "epoch": 8.674046740467405, "grad_norm": 0.9114482118766372, "learning_rate": 4.593672523986936e-05, "loss": 1.9821, "step": 1769 }, { "epoch": 8.678966789667896, "grad_norm": 1.1611616411693073, "learning_rate": 4.56047976737457e-05, "loss": 1.7694, "step": 1770 }, { "epoch": 8.68388683886839, "grad_norm": 1.163481630110185, "learning_rate": 4.527401636149703e-05, "loss": 1.8389, "step": 1771 }, { "epoch": 8.68880688806888, "grad_norm": 1.1193423275884535, "learning_rate": 4.4944382137549864e-05, "loss": 1.6737, "step": 1772 }, { "epoch": 8.693726937269373, "grad_norm": 0.8149845989586234, "learning_rate": 4.461589583343678e-05, "loss": 1.5693, "step": 1773 }, { "epoch": 8.698646986469864, "grad_norm": 1.1789889656822132, "learning_rate": 4.4288558277795046e-05, "loss": 1.7583, "step": 1774 }, { "epoch": 8.703567035670357, "grad_norm": 1.0265750092437849, "learning_rate": 4.3962370296363854e-05, "loss": 1.7428, "step": 1775 }, { "epoch": 8.708487084870848, "grad_norm": 0.9675931091040959, "learning_rate": 4.363733271198239e-05, "loss": 1.7009, "step": 1776 }, { "epoch": 8.71340713407134, "grad_norm": 1.4210553813204365, "learning_rate": 4.331344634458812e-05, "loss": 1.9037, "step": 1777 }, { "epoch": 8.718327183271832, "grad_norm": 1.0403454741238267, "learning_rate": 4.2990712011214236e-05, "loss": 1.5685, "step": 1778 }, { "epoch": 8.723247232472325, "grad_norm": 0.96664912897252, "learning_rate": 4.266913052598792e-05, "loss": 1.7512, "step": 1779 }, { "epoch": 8.728167281672818, "grad_norm": 1.2894585964855934, "learning_rate": 4.2348702700128305e-05, "loss": 1.5651, "step": 1780 }, { "epoch": 8.733087330873309, "grad_norm": 1.092409096529127, "learning_rate": 4.202942934194398e-05, "loss": 2.1391, "step": 1781 }, { "epoch": 8.738007380073801, "grad_norm": 0.8961894973569845, "learning_rate": 4.17113112568317e-05, "loss": 1.5865, "step": 1782 }, { "epoch": 8.742927429274292, "grad_norm": 1.2590546562671385, "learning_rate": 4.139434924727359e-05, "loss": 1.7343, "step": 1783 }, { "epoch": 8.747847478474785, "grad_norm": 1.2458987527257095, "learning_rate": 4.107854411283551e-05, "loss": 1.6016, "step": 1784 }, { "epoch": 8.752767527675276, "grad_norm": 1.526226859567536, "learning_rate": 4.0763896650165225e-05, "loss": 1.5786, "step": 1785 }, { "epoch": 8.75768757687577, "grad_norm": 0.9859463114333676, "learning_rate": 4.045040765299007e-05, "loss": 1.8845, "step": 1786 }, { "epoch": 8.76260762607626, "grad_norm": 1.5424396471480366, "learning_rate": 4.0138077912114824e-05, "loss": 1.5569, "step": 1787 }, { "epoch": 8.767527675276753, "grad_norm": 1.498388574177117, "learning_rate": 3.9826908215420345e-05, "loss": 1.4666, "step": 1788 }, { "epoch": 8.772447724477244, "grad_norm": 0.9285378526175899, "learning_rate": 3.951689934786068e-05, "loss": 1.7754, "step": 1789 }, { "epoch": 8.777367773677737, "grad_norm": 0.9962830511204391, "learning_rate": 3.920805209146205e-05, "loss": 2.012, "step": 1790 }, { "epoch": 8.782287822878228, "grad_norm": 0.7715796229192461, "learning_rate": 3.8900367225320035e-05, "loss": 1.724, "step": 1791 }, { "epoch": 8.78720787207872, "grad_norm": 1.2185676501009726, "learning_rate": 3.859384552559825e-05, "loss": 1.9291, "step": 1792 }, { "epoch": 8.792127921279214, "grad_norm": 0.9234312423800874, "learning_rate": 3.828848776552596e-05, "loss": 1.9439, "step": 1793 }, { "epoch": 8.797047970479705, "grad_norm": 1.1837849195331809, "learning_rate": 3.7984294715396274e-05, "loss": 1.8968, "step": 1794 }, { "epoch": 8.801968019680197, "grad_norm": 1.1687172054118482, "learning_rate": 3.768126714256437e-05, "loss": 1.8588, "step": 1795 }, { "epoch": 8.806888068880689, "grad_norm": 1.3964068008919728, "learning_rate": 3.7379405811445275e-05, "loss": 1.5833, "step": 1796 }, { "epoch": 8.811808118081181, "grad_norm": 1.0891610399721001, "learning_rate": 3.707871148351183e-05, "loss": 1.5543, "step": 1797 }, { "epoch": 8.816728167281672, "grad_norm": 1.1386187141870403, "learning_rate": 3.677918491729365e-05, "loss": 1.6051, "step": 1798 }, { "epoch": 8.821648216482165, "grad_norm": 2.727845228295955, "learning_rate": 3.648082686837395e-05, "loss": 1.6603, "step": 1799 }, { "epoch": 8.826568265682656, "grad_norm": 0.9650858887105672, "learning_rate": 3.618363808938846e-05, "loss": 1.6754, "step": 1800 }, { "epoch": 8.83148831488315, "grad_norm": 1.2178298693460023, "learning_rate": 3.5887619330023434e-05, "loss": 1.7683, "step": 1801 }, { "epoch": 8.83640836408364, "grad_norm": 1.3700316141727453, "learning_rate": 3.5592771337013386e-05, "loss": 1.793, "step": 1802 }, { "epoch": 8.841328413284133, "grad_norm": 1.6541167214646193, "learning_rate": 3.529909485413968e-05, "loss": 1.8864, "step": 1803 }, { "epoch": 8.846248462484624, "grad_norm": 1.7754753316121141, "learning_rate": 3.500659062222855e-05, "loss": 2.0816, "step": 1804 }, { "epoch": 8.851168511685117, "grad_norm": 1.0739325275075182, "learning_rate": 3.4715259379148655e-05, "loss": 1.8086, "step": 1805 }, { "epoch": 8.85608856088561, "grad_norm": 1.2719923664704387, "learning_rate": 3.4425101859810235e-05, "loss": 1.5868, "step": 1806 }, { "epoch": 8.8610086100861, "grad_norm": 1.5753767568777983, "learning_rate": 3.4136118796162184e-05, "loss": 1.4917, "step": 1807 }, { "epoch": 8.865928659286594, "grad_norm": 1.0361904123374601, "learning_rate": 3.38483109171912e-05, "loss": 1.5892, "step": 1808 }, { "epoch": 8.870848708487085, "grad_norm": 1.403986885724246, "learning_rate": 3.356167894891909e-05, "loss": 1.8348, "step": 1809 }, { "epoch": 8.875768757687577, "grad_norm": 1.5343589887410538, "learning_rate": 3.327622361440152e-05, "loss": 1.6214, "step": 1810 }, { "epoch": 8.880688806888068, "grad_norm": 1.0811943323455335, "learning_rate": 3.299194563372604e-05, "loss": 2.0471, "step": 1811 }, { "epoch": 8.885608856088561, "grad_norm": 1.5491136406007766, "learning_rate": 3.270884572401001e-05, "loss": 2.2322, "step": 1812 }, { "epoch": 8.890528905289052, "grad_norm": 1.7867521365503656, "learning_rate": 3.2426924599399055e-05, "loss": 1.9042, "step": 1813 }, { "epoch": 8.895448954489545, "grad_norm": 0.8550892944878195, "learning_rate": 3.214618297106536e-05, "loss": 1.7824, "step": 1814 }, { "epoch": 8.900369003690036, "grad_norm": 0.8547949817151498, "learning_rate": 3.1866621547205485e-05, "loss": 1.5642, "step": 1815 }, { "epoch": 8.905289052890529, "grad_norm": 1.5953694378587464, "learning_rate": 3.15882410330392e-05, "loss": 1.9073, "step": 1816 }, { "epoch": 8.91020910209102, "grad_norm": 1.115529634350517, "learning_rate": 3.131104213080688e-05, "loss": 1.9726, "step": 1817 }, { "epoch": 8.915129151291513, "grad_norm": 1.0371471040416336, "learning_rate": 3.1035025539768336e-05, "loss": 1.9007, "step": 1818 }, { "epoch": 8.920049200492006, "grad_norm": 0.9279658422771734, "learning_rate": 3.0760191956201114e-05, "loss": 1.8229, "step": 1819 }, { "epoch": 8.924969249692497, "grad_norm": 1.2100311595228603, "learning_rate": 3.0486542073398016e-05, "loss": 1.9348, "step": 1820 }, { "epoch": 8.92988929889299, "grad_norm": 1.207049033817677, "learning_rate": 3.021407658166636e-05, "loss": 1.4349, "step": 1821 }, { "epoch": 8.93480934809348, "grad_norm": 0.8039657219324816, "learning_rate": 2.994279616832557e-05, "loss": 1.6745, "step": 1822 }, { "epoch": 8.939729397293974, "grad_norm": 1.1498836212170411, "learning_rate": 2.9672701517705403e-05, "loss": 1.8878, "step": 1823 }, { "epoch": 8.944649446494465, "grad_norm": 1.2997863715167364, "learning_rate": 2.9403793311144623e-05, "loss": 1.7952, "step": 1824 }, { "epoch": 8.949569495694957, "grad_norm": 1.0688677107318973, "learning_rate": 2.9136072226989052e-05, "loss": 1.6517, "step": 1825 }, { "epoch": 8.954489544895448, "grad_norm": 1.2261978536910902, "learning_rate": 2.8869538940589802e-05, "loss": 2.0233, "step": 1826 }, { "epoch": 8.959409594095941, "grad_norm": 2.4521494479120323, "learning_rate": 2.860419412430165e-05, "loss": 1.8547, "step": 1827 }, { "epoch": 8.964329643296432, "grad_norm": 1.0772402144390045, "learning_rate": 2.8340038447481564e-05, "loss": 1.6255, "step": 1828 }, { "epoch": 8.969249692496925, "grad_norm": 1.025596636705251, "learning_rate": 2.807707257648662e-05, "loss": 1.678, "step": 1829 }, { "epoch": 8.974169741697416, "grad_norm": 1.200832967192393, "learning_rate": 2.781529717467246e-05, "loss": 1.6146, "step": 1830 }, { "epoch": 8.979089790897909, "grad_norm": 1.1174205867339357, "learning_rate": 2.7554712902391645e-05, "loss": 1.4478, "step": 1831 }, { "epoch": 8.984009840098402, "grad_norm": 1.3413906351921359, "learning_rate": 2.7295320416992288e-05, "loss": 1.7165, "step": 1832 }, { "epoch": 8.988929889298893, "grad_norm": 1.000778982111418, "learning_rate": 2.7037120372815636e-05, "loss": 1.7154, "step": 1833 }, { "epoch": 8.993849938499386, "grad_norm": 1.8644123281315497, "learning_rate": 2.6780113421195295e-05, "loss": 1.648, "step": 1834 }, { "epoch": 8.998769987699877, "grad_norm": 1.111396799477143, "learning_rate": 2.65243002104551e-05, "loss": 1.9966, "step": 1835 }, { "epoch": 9.0, "grad_norm": 1.111396799477143, "learning_rate": 2.6269681385907406e-05, "loss": 0.5564, "step": 1836 }, { "epoch": 9.004920049200493, "grad_norm": 1.1806178691376863, "learning_rate": 2.6016257589851823e-05, "loss": 1.9736, "step": 1837 }, { "epoch": 9.009840098400984, "grad_norm": 0.9121292037132614, "learning_rate": 2.5764029461573134e-05, "loss": 1.3871, "step": 1838 }, { "epoch": 9.014760147601477, "grad_norm": 1.7584248938272826, "learning_rate": 2.551299763734011e-05, "loss": 2.0601, "step": 1839 }, { "epoch": 9.019680196801968, "grad_norm": 1.1740401893268102, "learning_rate": 2.526316275040391e-05, "loss": 1.6248, "step": 1840 }, { "epoch": 9.02460024600246, "grad_norm": 1.3491681370399788, "learning_rate": 2.5014525430995914e-05, "loss": 1.9072, "step": 1841 }, { "epoch": 9.029520295202952, "grad_norm": 0.9409781310772187, "learning_rate": 2.47670863063269e-05, "loss": 1.5223, "step": 1842 }, { "epoch": 9.034440344403444, "grad_norm": 1.6145402403971778, "learning_rate": 2.4520846000584795e-05, "loss": 1.844, "step": 1843 }, { "epoch": 9.039360393603936, "grad_norm": 1.5043504426728962, "learning_rate": 2.4275805134933493e-05, "loss": 1.4232, "step": 1844 }, { "epoch": 9.044280442804428, "grad_norm": 1.0860961296623832, "learning_rate": 2.4031964327511314e-05, "loss": 2.069, "step": 1845 }, { "epoch": 9.04920049200492, "grad_norm": 1.6323810007175725, "learning_rate": 2.3789324193429106e-05, "loss": 1.8905, "step": 1846 }, { "epoch": 9.054120541205412, "grad_norm": 1.1122852804195322, "learning_rate": 2.354788534476915e-05, "loss": 1.6698, "step": 1847 }, { "epoch": 9.059040590405903, "grad_norm": 2.1623075423807836, "learning_rate": 2.330764839058319e-05, "loss": 1.9137, "step": 1848 }, { "epoch": 9.063960639606396, "grad_norm": 1.2642983794308742, "learning_rate": 2.3068613936891136e-05, "loss": 1.5266, "step": 1849 }, { "epoch": 9.068880688806889, "grad_norm": 1.2405146279348278, "learning_rate": 2.2830782586679477e-05, "loss": 1.6403, "step": 1850 }, { "epoch": 9.07380073800738, "grad_norm": 1.01248797327395, "learning_rate": 2.2594154939899802e-05, "loss": 1.6993, "step": 1851 }, { "epoch": 9.078720787207873, "grad_norm": 1.8779207114934513, "learning_rate": 2.235873159346702e-05, "loss": 1.9095, "step": 1852 }, { "epoch": 9.083640836408364, "grad_norm": 0.7221818429401334, "learning_rate": 2.2124513141258574e-05, "loss": 1.6935, "step": 1853 }, { "epoch": 9.088560885608857, "grad_norm": 1.1953431445695593, "learning_rate": 2.18915001741119e-05, "loss": 1.4466, "step": 1854 }, { "epoch": 9.093480934809348, "grad_norm": 1.034866816707304, "learning_rate": 2.1659693279823924e-05, "loss": 2.0679, "step": 1855 }, { "epoch": 9.09840098400984, "grad_norm": 1.2344014419168687, "learning_rate": 2.1429093043148884e-05, "loss": 1.6251, "step": 1856 }, { "epoch": 9.103321033210332, "grad_norm": 0.9575404462502418, "learning_rate": 2.1199700045797076e-05, "loss": 1.4868, "step": 1857 }, { "epoch": 9.108241082410824, "grad_norm": 1.0772247159287756, "learning_rate": 2.097151486643356e-05, "loss": 1.7038, "step": 1858 }, { "epoch": 9.113161131611315, "grad_norm": 1.3903152838662882, "learning_rate": 2.0744538080676668e-05, "loss": 2.0722, "step": 1859 }, { "epoch": 9.118081180811808, "grad_norm": 1.3887944418891673, "learning_rate": 2.0518770261096163e-05, "loss": 1.6272, "step": 1860 }, { "epoch": 9.1230012300123, "grad_norm": 1.057437445187002, "learning_rate": 2.029421197721232e-05, "loss": 1.5937, "step": 1861 }, { "epoch": 9.127921279212792, "grad_norm": 1.4411509912264633, "learning_rate": 2.0070863795494055e-05, "loss": 2.0835, "step": 1862 }, { "epoch": 9.132841328413285, "grad_norm": 0.9607397958892115, "learning_rate": 1.9848726279357964e-05, "loss": 2.1444, "step": 1863 }, { "epoch": 9.137761377613776, "grad_norm": 1.1554776582059094, "learning_rate": 1.962779998916625e-05, "loss": 1.7094, "step": 1864 }, { "epoch": 9.142681426814269, "grad_norm": 1.0832110836479254, "learning_rate": 1.9408085482225947e-05, "loss": 1.8042, "step": 1865 }, { "epoch": 9.14760147601476, "grad_norm": 1.4969218179495427, "learning_rate": 1.9189583312787306e-05, "loss": 1.8035, "step": 1866 }, { "epoch": 9.152521525215253, "grad_norm": 1.2239918371906118, "learning_rate": 1.8972294032042092e-05, "loss": 1.7448, "step": 1867 }, { "epoch": 9.157441574415744, "grad_norm": 1.2572022119391288, "learning_rate": 1.8756218188122732e-05, "loss": 1.8678, "step": 1868 }, { "epoch": 9.162361623616237, "grad_norm": 1.0478289488864625, "learning_rate": 1.8541356326100432e-05, "loss": 1.6622, "step": 1869 }, { "epoch": 9.167281672816728, "grad_norm": 1.1996096366608886, "learning_rate": 1.8327708987983915e-05, "loss": 1.6592, "step": 1870 }, { "epoch": 9.17220172201722, "grad_norm": 0.8160054258243729, "learning_rate": 1.811527671271862e-05, "loss": 1.5211, "step": 1871 }, { "epoch": 9.177121771217712, "grad_norm": 2.354580691646048, "learning_rate": 1.79040600361845e-05, "loss": 1.7464, "step": 1872 }, { "epoch": 9.182041820418204, "grad_norm": 0.5835289378583087, "learning_rate": 1.7694059491195014e-05, "loss": 1.4999, "step": 1873 }, { "epoch": 9.186961869618695, "grad_norm": 1.0673288595979544, "learning_rate": 1.7485275607496077e-05, "loss": 1.8927, "step": 1874 }, { "epoch": 9.191881918819188, "grad_norm": 1.198155753989505, "learning_rate": 1.7277708911764222e-05, "loss": 1.7641, "step": 1875 }, { "epoch": 9.196801968019681, "grad_norm": 0.9991319707658668, "learning_rate": 1.7071359927605613e-05, "loss": 1.5904, "step": 1876 }, { "epoch": 9.201722017220172, "grad_norm": 1.4395864560806426, "learning_rate": 1.6866229175554748e-05, "loss": 1.433, "step": 1877 }, { "epoch": 9.206642066420665, "grad_norm": 0.9593884001611501, "learning_rate": 1.666231717307276e-05, "loss": 1.7272, "step": 1878 }, { "epoch": 9.211562115621156, "grad_norm": 0.9349486908719573, "learning_rate": 1.6459624434546628e-05, "loss": 1.7438, "step": 1879 }, { "epoch": 9.216482164821649, "grad_norm": 1.4731831611036477, "learning_rate": 1.6258151471287396e-05, "loss": 2.2306, "step": 1880 }, { "epoch": 9.22140221402214, "grad_norm": 1.6935010798726908, "learning_rate": 1.6057898791529303e-05, "loss": 1.9716, "step": 1881 }, { "epoch": 9.226322263222633, "grad_norm": 1.0006934989357856, "learning_rate": 1.5858866900428205e-05, "loss": 1.0873, "step": 1882 }, { "epoch": 9.231242312423124, "grad_norm": 1.1101141552494582, "learning_rate": 1.5661056300060427e-05, "loss": 1.7698, "step": 1883 }, { "epoch": 9.236162361623617, "grad_norm": 1.4234467329510256, "learning_rate": 1.546446748942154e-05, "loss": 1.9231, "step": 1884 }, { "epoch": 9.241082410824108, "grad_norm": 1.7183080196452807, "learning_rate": 1.526910096442491e-05, "loss": 1.5394, "step": 1885 }, { "epoch": 9.2460024600246, "grad_norm": 0.9053256034672793, "learning_rate": 1.5074957217900641e-05, "loss": 1.8591, "step": 1886 }, { "epoch": 9.250922509225092, "grad_norm": 0.9874745973834318, "learning_rate": 1.4882036739594373e-05, "loss": 1.4848, "step": 1887 }, { "epoch": 9.255842558425584, "grad_norm": 0.8571811727481987, "learning_rate": 1.4690340016165703e-05, "loss": 1.7979, "step": 1888 }, { "epoch": 9.260762607626077, "grad_norm": 0.9976672216364484, "learning_rate": 1.4499867531187371e-05, "loss": 1.9145, "step": 1889 }, { "epoch": 9.265682656826568, "grad_norm": 0.6798710365457964, "learning_rate": 1.4310619765143862e-05, "loss": 1.7491, "step": 1890 }, { "epoch": 9.270602706027061, "grad_norm": 1.4843159391469227, "learning_rate": 1.4122597195430075e-05, "loss": 1.8606, "step": 1891 }, { "epoch": 9.275522755227552, "grad_norm": 1.2203581142608524, "learning_rate": 1.3935800296350387e-05, "loss": 1.7699, "step": 1892 }, { "epoch": 9.280442804428045, "grad_norm": 0.9130222864408652, "learning_rate": 1.3750229539117144e-05, "loss": 1.901, "step": 1893 }, { "epoch": 9.285362853628536, "grad_norm": 1.1282762106994804, "learning_rate": 1.356588539184983e-05, "loss": 1.8734, "step": 1894 }, { "epoch": 9.290282902829029, "grad_norm": 1.4147688231129543, "learning_rate": 1.3382768319573524e-05, "loss": 1.6344, "step": 1895 }, { "epoch": 9.29520295202952, "grad_norm": 0.6677758321629301, "learning_rate": 1.320087878421794e-05, "loss": 1.6467, "step": 1896 }, { "epoch": 9.300123001230013, "grad_norm": 1.489646104586918, "learning_rate": 1.3020217244616272e-05, "loss": 1.9158, "step": 1897 }, { "epoch": 9.305043050430504, "grad_norm": 0.9913052208702202, "learning_rate": 1.284078415650397e-05, "loss": 1.7481, "step": 1898 }, { "epoch": 9.309963099630997, "grad_norm": 1.1292321731842416, "learning_rate": 1.2662579972517462e-05, "loss": 1.4944, "step": 1899 }, { "epoch": 9.314883148831488, "grad_norm": 0.8423654866727116, "learning_rate": 1.2485605142193379e-05, "loss": 1.4386, "step": 1900 }, { "epoch": 9.31980319803198, "grad_norm": 0.9597229869751069, "learning_rate": 1.2309860111967054e-05, "loss": 1.7582, "step": 1901 }, { "epoch": 9.324723247232471, "grad_norm": 0.8258484998198117, "learning_rate": 1.2135345325171465e-05, "loss": 1.5908, "step": 1902 }, { "epoch": 9.329643296432964, "grad_norm": 1.6183333915281615, "learning_rate": 1.1962061222036469e-05, "loss": 1.9629, "step": 1903 }, { "epoch": 9.334563345633457, "grad_norm": 1.2381510744956405, "learning_rate": 1.1790008239687011e-05, "loss": 1.8782, "step": 1904 }, { "epoch": 9.339483394833948, "grad_norm": 0.8603143521642044, "learning_rate": 1.1619186812142856e-05, "loss": 1.4836, "step": 1905 }, { "epoch": 9.344403444034441, "grad_norm": 1.3250521506758608, "learning_rate": 1.1449597370316645e-05, "loss": 1.7228, "step": 1906 }, { "epoch": 9.349323493234932, "grad_norm": 1.0423125717101487, "learning_rate": 1.1281240342013443e-05, "loss": 2.079, "step": 1907 }, { "epoch": 9.354243542435425, "grad_norm": 0.7689430925532438, "learning_rate": 1.1114116151929531e-05, "loss": 1.7021, "step": 1908 }, { "epoch": 9.359163591635916, "grad_norm": 0.8673258399236742, "learning_rate": 1.0948225221651009e-05, "loss": 1.5855, "step": 1909 }, { "epoch": 9.364083640836409, "grad_norm": 0.9167759347482577, "learning_rate": 1.0783567969653129e-05, "loss": 1.7662, "step": 1910 }, { "epoch": 9.3690036900369, "grad_norm": 1.0439739660242093, "learning_rate": 1.0620144811299027e-05, "loss": 1.9823, "step": 1911 }, { "epoch": 9.373923739237393, "grad_norm": 0.7058291441316228, "learning_rate": 1.0457956158838544e-05, "loss": 1.5451, "step": 1912 }, { "epoch": 9.378843788437884, "grad_norm": 0.855343259410869, "learning_rate": 1.0297002421407797e-05, "loss": 1.6821, "step": 1913 }, { "epoch": 9.383763837638377, "grad_norm": 1.0583743318959407, "learning_rate": 1.0137284005027337e-05, "loss": 1.705, "step": 1914 }, { "epoch": 9.388683886838868, "grad_norm": 1.3273197034358621, "learning_rate": 9.978801312601537e-06, "loss": 1.9384, "step": 1915 }, { "epoch": 9.39360393603936, "grad_norm": 1.1573358447544475, "learning_rate": 9.821554743917826e-06, "loss": 1.9798, "step": 1916 }, { "epoch": 9.398523985239853, "grad_norm": 0.977077822890723, "learning_rate": 9.665544695645011e-06, "loss": 1.8287, "step": 1917 }, { "epoch": 9.403444034440344, "grad_norm": 0.97525215892234, "learning_rate": 9.510771561332954e-06, "loss": 1.2339, "step": 1918 }, { "epoch": 9.408364083640837, "grad_norm": 1.1486597366746611, "learning_rate": 9.357235731411173e-06, "loss": 1.6296, "step": 1919 }, { "epoch": 9.413284132841328, "grad_norm": 1.0291144931227592, "learning_rate": 9.204937593187968e-06, "loss": 1.6322, "step": 1920 }, { "epoch": 9.418204182041821, "grad_norm": 1.1988583453565496, "learning_rate": 9.053877530849463e-06, "loss": 1.8579, "step": 1921 }, { "epoch": 9.423124231242312, "grad_norm": 1.4674268377100734, "learning_rate": 8.90405592545862e-06, "loss": 1.7796, "step": 1922 }, { "epoch": 9.428044280442805, "grad_norm": 0.8458317150250078, "learning_rate": 8.755473154954341e-06, "loss": 1.6076, "step": 1923 }, { "epoch": 9.432964329643296, "grad_norm": 1.1886888339618826, "learning_rate": 8.608129594150249e-06, "loss": 1.7915, "step": 1924 }, { "epoch": 9.437884378843789, "grad_norm": 0.860041970635487, "learning_rate": 8.462025614734192e-06, "loss": 1.695, "step": 1925 }, { "epoch": 9.44280442804428, "grad_norm": 1.3201424270557527, "learning_rate": 8.317161585266963e-06, "loss": 1.5827, "step": 1926 }, { "epoch": 9.447724477244773, "grad_norm": 1.3084980350614985, "learning_rate": 8.173537871181413e-06, "loss": 1.6978, "step": 1927 }, { "epoch": 9.452644526445264, "grad_norm": 1.1309977457403355, "learning_rate": 8.031154834781618e-06, "loss": 1.8411, "step": 1928 }, { "epoch": 9.457564575645756, "grad_norm": 0.9322633534314783, "learning_rate": 7.890012835242044e-06, "loss": 1.7506, "step": 1929 }, { "epoch": 9.46248462484625, "grad_norm": 0.9303460793805016, "learning_rate": 7.750112228606276e-06, "loss": 1.5287, "step": 1930 }, { "epoch": 9.46740467404674, "grad_norm": 0.9515402251736332, "learning_rate": 7.611453367786569e-06, "loss": 1.6482, "step": 1931 }, { "epoch": 9.472324723247233, "grad_norm": 0.7799635892819466, "learning_rate": 7.4740366025627945e-06, "loss": 1.4108, "step": 1932 }, { "epoch": 9.477244772447724, "grad_norm": 0.8944049826390189, "learning_rate": 7.337862279581331e-06, "loss": 1.2453, "step": 1933 }, { "epoch": 9.482164821648217, "grad_norm": 0.7772642557109357, "learning_rate": 7.202930742354619e-06, "loss": 1.5688, "step": 1934 }, { "epoch": 9.487084870848708, "grad_norm": 0.6369987148816287, "learning_rate": 7.069242331259718e-06, "loss": 1.8392, "step": 1935 }, { "epoch": 9.492004920049201, "grad_norm": 0.9937749147084921, "learning_rate": 6.93679738353814e-06, "loss": 1.6293, "step": 1936 }, { "epoch": 9.496924969249692, "grad_norm": 1.3525126573609088, "learning_rate": 6.805596233294575e-06, "loss": 1.7248, "step": 1937 }, { "epoch": 9.501845018450185, "grad_norm": 1.6285984487382072, "learning_rate": 6.675639211495832e-06, "loss": 1.5791, "step": 1938 }, { "epoch": 9.506765067650676, "grad_norm": 0.7837066903271207, "learning_rate": 6.546926645970674e-06, "loss": 1.7182, "step": 1939 }, { "epoch": 9.511685116851169, "grad_norm": 1.0803245938538633, "learning_rate": 6.419458861408378e-06, "loss": 1.6902, "step": 1940 }, { "epoch": 9.51660516605166, "grad_norm": 0.8576774956083024, "learning_rate": 6.293236179358175e-06, "loss": 1.2841, "step": 1941 }, { "epoch": 9.521525215252153, "grad_norm": 1.2199254110748745, "learning_rate": 6.168258918228475e-06, "loss": 1.8884, "step": 1942 }, { "epoch": 9.526445264452644, "grad_norm": 1.3913682421760307, "learning_rate": 6.044527393286037e-06, "loss": 1.6293, "step": 1943 }, { "epoch": 9.531365313653136, "grad_norm": 0.7534301625272705, "learning_rate": 5.9220419166549075e-06, "loss": 1.7491, "step": 1944 }, { "epoch": 9.53628536285363, "grad_norm": 0.6281309055749617, "learning_rate": 5.800802797316151e-06, "loss": 2.0448, "step": 1945 }, { "epoch": 9.54120541205412, "grad_norm": 1.0991341303642412, "learning_rate": 5.680810341106568e-06, "loss": 1.5749, "step": 1946 }, { "epoch": 9.546125461254613, "grad_norm": 0.7776273817275857, "learning_rate": 5.562064850718252e-06, "loss": 1.5098, "step": 1947 }, { "epoch": 9.551045510455104, "grad_norm": 1.1717432885058108, "learning_rate": 5.444566625697644e-06, "loss": 1.8786, "step": 1948 }, { "epoch": 9.555965559655597, "grad_norm": 1.1017738942484567, "learning_rate": 5.328315962444874e-06, "loss": 1.9543, "step": 1949 }, { "epoch": 9.560885608856088, "grad_norm": 0.6608550978822061, "learning_rate": 5.2133131542130845e-06, "loss": 1.6919, "step": 1950 }, { "epoch": 9.565805658056581, "grad_norm": 1.025055182183759, "learning_rate": 5.0995584911072705e-06, "loss": 1.7472, "step": 1951 }, { "epoch": 9.570725707257072, "grad_norm": 1.7192078052817188, "learning_rate": 4.987052260084279e-06, "loss": 2.2952, "step": 1952 }, { "epoch": 9.575645756457565, "grad_norm": 0.8586031620315582, "learning_rate": 4.8757947449514225e-06, "loss": 1.5612, "step": 1953 }, { "epoch": 9.580565805658056, "grad_norm": 0.9726175573621527, "learning_rate": 4.76578622636592e-06, "loss": 1.6537, "step": 1954 }, { "epoch": 9.585485854858549, "grad_norm": 0.8854111558571934, "learning_rate": 4.657026981834622e-06, "loss": 1.7092, "step": 1955 }, { "epoch": 9.59040590405904, "grad_norm": 0.8627315962998274, "learning_rate": 4.549517285712678e-06, "loss": 2.031, "step": 1956 }, { "epoch": 9.595325953259533, "grad_norm": 0.8746046069238358, "learning_rate": 4.443257409203205e-06, "loss": 1.6989, "step": 1957 }, { "epoch": 9.600246002460025, "grad_norm": 0.6190073875999155, "learning_rate": 4.338247620356617e-06, "loss": 1.5269, "step": 1958 }, { "epoch": 9.605166051660516, "grad_norm": 1.3214684388083737, "learning_rate": 4.234488184069741e-06, "loss": 1.8313, "step": 1959 }, { "epoch": 9.61008610086101, "grad_norm": 0.864504838346002, "learning_rate": 4.13197936208537e-06, "loss": 1.6128, "step": 1960 }, { "epoch": 9.6150061500615, "grad_norm": 1.0019574337168475, "learning_rate": 4.03072141299149e-06, "loss": 1.7681, "step": 1961 }, { "epoch": 9.619926199261993, "grad_norm": 0.7867870933785908, "learning_rate": 3.930714592220608e-06, "loss": 1.5014, "step": 1962 }, { "epoch": 9.624846248462484, "grad_norm": 1.0976079110062698, "learning_rate": 3.831959152049202e-06, "loss": 1.9631, "step": 1963 }, { "epoch": 9.629766297662977, "grad_norm": 0.7433662462123659, "learning_rate": 3.7344553415969427e-06, "loss": 1.4281, "step": 1964 }, { "epoch": 9.634686346863468, "grad_norm": 0.749996349900797, "learning_rate": 3.6382034068263014e-06, "loss": 1.8769, "step": 1965 }, { "epoch": 9.63960639606396, "grad_norm": 0.9023391841861442, "learning_rate": 3.543203590541555e-06, "loss": 1.8692, "step": 1966 }, { "epoch": 9.644526445264452, "grad_norm": 0.9469686357331681, "learning_rate": 3.449456132388562e-06, "loss": 1.552, "step": 1967 }, { "epoch": 9.649446494464945, "grad_norm": 1.1146028329952171, "learning_rate": 3.356961268853986e-06, "loss": 1.812, "step": 1968 }, { "epoch": 9.654366543665436, "grad_norm": 0.8058267307046895, "learning_rate": 3.2657192332645747e-06, "loss": 1.9663, "step": 1969 }, { "epoch": 9.659286592865929, "grad_norm": 1.26537788171044, "learning_rate": 3.1757302557868793e-06, "loss": 1.8722, "step": 1970 }, { "epoch": 9.664206642066421, "grad_norm": 0.7401591403339268, "learning_rate": 3.0869945634263707e-06, "loss": 1.6657, "step": 1971 }, { "epoch": 9.669126691266912, "grad_norm": 0.742964546582401, "learning_rate": 2.9995123800270473e-06, "loss": 1.5688, "step": 1972 }, { "epoch": 9.674046740467405, "grad_norm": 0.624826677903814, "learning_rate": 2.913283926270771e-06, "loss": 1.3149, "step": 1973 }, { "epoch": 9.678966789667896, "grad_norm": 0.8670293102739373, "learning_rate": 2.828309419676822e-06, "loss": 1.557, "step": 1974 }, { "epoch": 9.68388683886839, "grad_norm": 1.15253622012935, "learning_rate": 2.744589074601178e-06, "loss": 1.8647, "step": 1975 }, { "epoch": 9.68880688806888, "grad_norm": 0.9864658838824784, "learning_rate": 2.662123102236236e-06, "loss": 1.9117, "step": 1976 }, { "epoch": 9.693726937269373, "grad_norm": 1.4926969204239677, "learning_rate": 2.5809117106099235e-06, "loss": 1.7924, "step": 1977 }, { "epoch": 9.698646986469864, "grad_norm": 0.653459986202843, "learning_rate": 2.500955104585534e-06, "loss": 1.9393, "step": 1978 }, { "epoch": 9.703567035670357, "grad_norm": 0.8138775269825579, "learning_rate": 2.4222534858610033e-06, "loss": 2.1043, "step": 1979 }, { "epoch": 9.708487084870848, "grad_norm": 0.8979058534888411, "learning_rate": 2.3448070529684116e-06, "loss": 1.6579, "step": 1980 }, { "epoch": 9.71340713407134, "grad_norm": 1.862765354629179, "learning_rate": 2.2686160012735934e-06, "loss": 1.6833, "step": 1981 }, { "epoch": 9.718327183271832, "grad_norm": 1.5178170890916094, "learning_rate": 2.193680522975472e-06, "loss": 1.5779, "step": 1982 }, { "epoch": 9.723247232472325, "grad_norm": 0.790962050713253, "learning_rate": 2.120000807105671e-06, "loss": 2.0662, "step": 1983 }, { "epoch": 9.728167281672818, "grad_norm": 1.0023793547736417, "learning_rate": 2.047577039528126e-06, "loss": 1.473, "step": 1984 }, { "epoch": 9.733087330873309, "grad_norm": 0.724015536180559, "learning_rate": 1.976409402938528e-06, "loss": 1.8344, "step": 1985 }, { "epoch": 9.738007380073801, "grad_norm": 0.9120088239521073, "learning_rate": 1.9064980768637164e-06, "loss": 1.8544, "step": 1986 }, { "epoch": 9.742927429274292, "grad_norm": 0.6199230468520007, "learning_rate": 1.8378432376615628e-06, "loss": 1.2538, "step": 1987 }, { "epoch": 9.747847478474785, "grad_norm": 0.8245444067221104, "learning_rate": 1.770445058520198e-06, "loss": 1.3598, "step": 1988 }, { "epoch": 9.752767527675276, "grad_norm": 0.8634846316532873, "learning_rate": 1.704303709457733e-06, "loss": 1.4574, "step": 1989 }, { "epoch": 9.75768757687577, "grad_norm": 0.8496020122680269, "learning_rate": 1.6394193573218142e-06, "loss": 1.8716, "step": 1990 }, { "epoch": 9.76260762607626, "grad_norm": 0.8608991417853307, "learning_rate": 1.5757921657892914e-06, "loss": 1.5103, "step": 1991 }, { "epoch": 9.767527675276753, "grad_norm": 1.6870194676002537, "learning_rate": 1.5134222953656074e-06, "loss": 1.6562, "step": 1992 }, { "epoch": 9.772447724477244, "grad_norm": 0.6923286066941966, "learning_rate": 1.452309903384519e-06, "loss": 1.8048, "step": 1993 }, { "epoch": 9.777367773677737, "grad_norm": 1.0347534586116063, "learning_rate": 1.392455144007654e-06, "loss": 1.6835, "step": 1994 }, { "epoch": 9.782287822878228, "grad_norm": 1.393445248250089, "learning_rate": 1.333858168224178e-06, "loss": 1.3154, "step": 1995 }, { "epoch": 9.78720787207872, "grad_norm": 0.8643067943255969, "learning_rate": 1.2765191238503503e-06, "loss": 1.5531, "step": 1996 }, { "epoch": 9.792127921279214, "grad_norm": 1.2525056526233265, "learning_rate": 1.220438155529302e-06, "loss": 1.9321, "step": 1997 }, { "epoch": 9.797047970479705, "grad_norm": 0.8992128486771933, "learning_rate": 1.165615404730369e-06, "loss": 1.6133, "step": 1998 }, { "epoch": 9.801968019680197, "grad_norm": 1.0667582908060487, "learning_rate": 1.1120510097490932e-06, "loss": 1.5207, "step": 1999 }, { "epoch": 9.806888068880689, "grad_norm": 0.7968040237160969, "learning_rate": 1.0597451057065e-06, "loss": 1.2629, "step": 2000 }, { "epoch": 9.811808118081181, "grad_norm": 0.8050298354243353, "learning_rate": 1.0086978245490986e-06, "loss": 1.956, "step": 2001 }, { "epoch": 9.816728167281672, "grad_norm": 0.9307529299389687, "learning_rate": 9.58909295048438e-07, "loss": 1.9684, "step": 2002 }, { "epoch": 9.821648216482165, "grad_norm": 0.7649806486487662, "learning_rate": 9.103796428006072e-07, "loss": 1.6952, "step": 2003 }, { "epoch": 9.826568265682656, "grad_norm": 0.8772635854732823, "learning_rate": 8.631089902261247e-07, "loss": 1.7494, "step": 2004 }, { "epoch": 9.83148831488315, "grad_norm": 0.8188978760222927, "learning_rate": 8.170974565696598e-07, "loss": 1.5855, "step": 2005 }, { "epoch": 9.83640836408364, "grad_norm": 0.7195899651498301, "learning_rate": 7.723451578995344e-07, "loss": 1.786, "step": 2006 }, { "epoch": 9.841328413284133, "grad_norm": 0.9411892687937236, "learning_rate": 7.288522071074999e-07, "loss": 1.3477, "step": 2007 }, { "epoch": 9.846248462484624, "grad_norm": 0.7739917600020032, "learning_rate": 6.866187139085711e-07, "loss": 1.562, "step": 2008 }, { "epoch": 9.851168511685117, "grad_norm": 0.9400521710261222, "learning_rate": 6.456447848406933e-07, "loss": 1.7501, "step": 2009 }, { "epoch": 9.85608856088561, "grad_norm": 1.17398993011282, "learning_rate": 6.059305232642975e-07, "loss": 1.6511, "step": 2010 }, { "epoch": 9.8610086100861, "grad_norm": 0.8466316589902197, "learning_rate": 5.674760293623016e-07, "loss": 1.7646, "step": 2011 }, { "epoch": 9.865928659286594, "grad_norm": 0.9301394272107432, "learning_rate": 5.30281400139776e-07, "loss": 1.8775, "step": 2012 }, { "epoch": 9.870848708487085, "grad_norm": 0.5094599978924221, "learning_rate": 4.943467294235559e-07, "loss": 2.0825, "step": 2013 }, { "epoch": 9.875768757687577, "grad_norm": 1.1384905300569401, "learning_rate": 4.596721078621302e-07, "loss": 1.8871, "step": 2014 }, { "epoch": 9.880688806888068, "grad_norm": 1.104650152116086, "learning_rate": 4.2625762292552994e-07, "loss": 1.6407, "step": 2015 }, { "epoch": 9.885608856088561, "grad_norm": 1.0476828689589526, "learning_rate": 3.941033589048293e-07, "loss": 1.8249, "step": 2016 }, { "epoch": 9.890528905289052, "grad_norm": 0.9024279479981275, "learning_rate": 3.632093969121453e-07, "loss": 1.6705, "step": 2017 }, { "epoch": 9.895448954489545, "grad_norm": 0.9235208708657762, "learning_rate": 3.3357581488030473e-07, "loss": 1.7782, "step": 2018 }, { "epoch": 9.900369003690036, "grad_norm": 1.2676370180558232, "learning_rate": 3.052026875628444e-07, "loss": 1.8465, "step": 2019 }, { "epoch": 9.905289052890529, "grad_norm": 0.7416325996977672, "learning_rate": 2.780900865335112e-07, "loss": 1.5277, "step": 2020 }, { "epoch": 9.91020910209102, "grad_norm": 0.9200532511346514, "learning_rate": 2.522380801863733e-07, "loss": 1.5378, "step": 2021 }, { "epoch": 9.915129151291513, "grad_norm": 1.2124285013599405, "learning_rate": 2.2764673373554256e-07, "loss": 1.7202, "step": 2022 }, { "epoch": 9.920049200492006, "grad_norm": 0.6792195806708115, "learning_rate": 2.0431610921489706e-07, "loss": 1.6023, "step": 2023 }, { "epoch": 9.924969249692497, "grad_norm": 1.1831199782967161, "learning_rate": 1.822462654781365e-07, "loss": 1.9207, "step": 2024 }, { "epoch": 9.92988929889299, "grad_norm": 1.0665113560650856, "learning_rate": 1.6143725819850463e-07, "loss": 1.5749, "step": 2025 }, { "epoch": 9.93480934809348, "grad_norm": 2.0539566160844545, "learning_rate": 1.4188913986856734e-07, "loss": 1.988, "step": 2026 }, { "epoch": 9.939729397293974, "grad_norm": 0.7056605801582343, "learning_rate": 1.236019598003235e-07, "loss": 1.6781, "step": 2027 }, { "epoch": 9.944649446494465, "grad_norm": 0.7276055842469298, "learning_rate": 1.0657576412487213e-07, "loss": 1.5018, "step": 2028 }, { "epoch": 9.949569495694957, "grad_norm": 2.0473094988536933, "learning_rate": 9.081059579235662e-08, "loss": 1.5708, "step": 2029 }, { "epoch": 9.954489544895448, "grad_norm": 0.9979005871866711, "learning_rate": 7.630649457179839e-08, "loss": 1.9254, "step": 2030 }, { "epoch": 9.959409594095941, "grad_norm": 0.632968219836749, "learning_rate": 6.306349705126335e-08, "loss": 1.6625, "step": 2031 }, { "epoch": 9.964329643296432, "grad_norm": 0.9759496224184575, "learning_rate": 5.1081636637306806e-08, "loss": 1.7273, "step": 2032 }, { "epoch": 9.969249692496925, "grad_norm": 0.6151702221314527, "learning_rate": 4.036094355541753e-08, "loss": 1.6283, "step": 2033 }, { "epoch": 9.974169741697416, "grad_norm": 0.9941304732395438, "learning_rate": 3.0901444849407157e-08, "loss": 1.5738, "step": 2034 }, { "epoch": 9.979089790897909, "grad_norm": 0.6512198932593747, "learning_rate": 2.2703164381743248e-08, "loss": 1.8018, "step": 2035 }, { "epoch": 9.984009840098402, "grad_norm": 0.9596323432377869, "learning_rate": 1.5766122833438257e-08, "loss": 1.9037, "step": 2036 }, { "epoch": 9.988929889298893, "grad_norm": 0.7041079335889304, "learning_rate": 1.009033770377199e-08, "loss": 1.2663, "step": 2037 }, { "epoch": 9.993849938499386, "grad_norm": 1.0602457669795844, "learning_rate": 5.6758233104026165e-09, "loss": 1.5727, "step": 2038 }, { "epoch": 9.998769987699877, "grad_norm": 1.2856054473435885, "learning_rate": 2.5225907894221855e-09, "loss": 1.8359, "step": 2039 }, { "epoch": 10.0, "grad_norm": 1.2856054473435885, "learning_rate": 6.306480950790672e-10, "loss": 0.5081, "step": 2040 }, { "epoch": 10.0, "step": 2040, "total_flos": 532738496569344.0, "train_loss": 2.8324778737682923, "train_runtime": 33563.3668, "train_samples_per_second": 3.874, "train_steps_per_second": 0.061 } ], "logging_steps": 1.0, "max_steps": 2040, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 532738496569344.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }