{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2835, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010582010582010583, "grad_norm": 49.71796324571249, "learning_rate": 0.0, "loss": 11.3407, "step": 1 }, { "epoch": 0.0021164021164021165, "grad_norm": 47.138920498887764, "learning_rate": 1.7605633802816901e-07, "loss": 11.3367, "step": 2 }, { "epoch": 0.0031746031746031746, "grad_norm": 45.86427493870416, "learning_rate": 3.5211267605633803e-07, "loss": 11.3782, "step": 3 }, { "epoch": 0.004232804232804233, "grad_norm": 46.592216421647926, "learning_rate": 5.28169014084507e-07, "loss": 11.4448, "step": 4 }, { "epoch": 0.005291005291005291, "grad_norm": 51.27861968325324, "learning_rate": 7.042253521126761e-07, "loss": 11.1976, "step": 5 }, { "epoch": 0.006349206349206349, "grad_norm": 48.68604904156757, "learning_rate": 8.802816901408452e-07, "loss": 11.3589, "step": 6 }, { "epoch": 0.007407407407407408, "grad_norm": 49.04868842468927, "learning_rate": 1.056338028169014e-06, "loss": 11.216, "step": 7 }, { "epoch": 0.008465608465608466, "grad_norm": 45.78424778978211, "learning_rate": 1.232394366197183e-06, "loss": 11.2126, "step": 8 }, { "epoch": 0.009523809523809525, "grad_norm": 48.288808560697824, "learning_rate": 1.4084507042253521e-06, "loss": 11.2473, "step": 9 }, { "epoch": 0.010582010582010581, "grad_norm": 47.00529127513613, "learning_rate": 1.5845070422535212e-06, "loss": 11.181, "step": 10 }, { "epoch": 0.01164021164021164, "grad_norm": 69.96523123803429, "learning_rate": 1.7605633802816904e-06, "loss": 10.1444, "step": 11 }, { "epoch": 0.012698412698412698, "grad_norm": 67.99361968629948, "learning_rate": 1.936619718309859e-06, "loss": 10.2666, "step": 12 }, { "epoch": 0.013756613756613757, "grad_norm": 82.7361163900619, "learning_rate": 2.112676056338028e-06, "loss": 9.6636, "step": 13 }, { "epoch": 0.014814814814814815, "grad_norm": 77.91883219725891, "learning_rate": 2.2887323943661975e-06, "loss": 9.7129, "step": 14 }, { "epoch": 0.015873015873015872, "grad_norm": 80.69870504421503, "learning_rate": 2.464788732394366e-06, "loss": 5.0932, "step": 15 }, { "epoch": 0.016931216931216932, "grad_norm": 73.85506610645764, "learning_rate": 2.640845070422535e-06, "loss": 4.7543, "step": 16 }, { "epoch": 0.01798941798941799, "grad_norm": 72.44122211654165, "learning_rate": 2.8169014084507042e-06, "loss": 4.7132, "step": 17 }, { "epoch": 0.01904761904761905, "grad_norm": 54.80640084894919, "learning_rate": 2.9929577464788733e-06, "loss": 3.8029, "step": 18 }, { "epoch": 0.020105820105820106, "grad_norm": 51.95247253108577, "learning_rate": 3.1690140845070423e-06, "loss": 3.7085, "step": 19 }, { "epoch": 0.021164021164021163, "grad_norm": 14.24491850890378, "learning_rate": 3.3450704225352113e-06, "loss": 2.1113, "step": 20 }, { "epoch": 0.022222222222222223, "grad_norm": 6.987157752616172, "learning_rate": 3.521126760563381e-06, "loss": 1.7438, "step": 21 }, { "epoch": 0.02328042328042328, "grad_norm": 5.830944797295927, "learning_rate": 3.6971830985915494e-06, "loss": 1.7295, "step": 22 }, { "epoch": 0.02433862433862434, "grad_norm": 5.138674915616038, "learning_rate": 3.873239436619718e-06, "loss": 1.6184, "step": 23 }, { "epoch": 0.025396825396825397, "grad_norm": 3.8763256924499605, "learning_rate": 4.0492957746478875e-06, "loss": 1.5636, "step": 24 }, { "epoch": 0.026455026455026454, "grad_norm": 4.336003458578358, "learning_rate": 4.225352112676056e-06, "loss": 1.5242, "step": 25 }, { "epoch": 0.027513227513227514, "grad_norm": 3.002308914115821, "learning_rate": 4.401408450704226e-06, "loss": 1.5592, "step": 26 }, { "epoch": 0.02857142857142857, "grad_norm": 2.211668797457168, "learning_rate": 4.577464788732395e-06, "loss": 1.3246, "step": 27 }, { "epoch": 0.02962962962962963, "grad_norm": 1.856090631854003, "learning_rate": 4.753521126760564e-06, "loss": 1.3504, "step": 28 }, { "epoch": 0.030687830687830688, "grad_norm": 1.730441467550159, "learning_rate": 4.929577464788732e-06, "loss": 1.1383, "step": 29 }, { "epoch": 0.031746031746031744, "grad_norm": 3.037278309026151, "learning_rate": 5.105633802816902e-06, "loss": 1.0615, "step": 30 }, { "epoch": 0.0328042328042328, "grad_norm": 20.463734102356668, "learning_rate": 5.28169014084507e-06, "loss": 1.1804, "step": 31 }, { "epoch": 0.033862433862433865, "grad_norm": 1.1822365397348313, "learning_rate": 5.457746478873239e-06, "loss": 1.1662, "step": 32 }, { "epoch": 0.03492063492063492, "grad_norm": 0.9353301639240665, "learning_rate": 5.6338028169014084e-06, "loss": 1.0131, "step": 33 }, { "epoch": 0.03597883597883598, "grad_norm": 0.9206265704946479, "learning_rate": 5.809859154929578e-06, "loss": 1.0732, "step": 34 }, { "epoch": 0.037037037037037035, "grad_norm": 11.424200349684694, "learning_rate": 5.9859154929577465e-06, "loss": 1.1348, "step": 35 }, { "epoch": 0.0380952380952381, "grad_norm": 0.8604419013240469, "learning_rate": 6.161971830985916e-06, "loss": 1.0904, "step": 36 }, { "epoch": 0.039153439153439155, "grad_norm": 0.779260466106171, "learning_rate": 6.338028169014085e-06, "loss": 0.9949, "step": 37 }, { "epoch": 0.04021164021164021, "grad_norm": 1.053891911809051, "learning_rate": 6.514084507042253e-06, "loss": 0.9232, "step": 38 }, { "epoch": 0.04126984126984127, "grad_norm": 0.8065280665981377, "learning_rate": 6.690140845070423e-06, "loss": 1.0275, "step": 39 }, { "epoch": 0.042328042328042326, "grad_norm": 0.5932376581872476, "learning_rate": 6.866197183098592e-06, "loss": 0.9951, "step": 40 }, { "epoch": 0.04338624338624339, "grad_norm": 0.7470084393654567, "learning_rate": 7.042253521126762e-06, "loss": 0.9576, "step": 41 }, { "epoch": 0.044444444444444446, "grad_norm": 0.7190983046762137, "learning_rate": 7.21830985915493e-06, "loss": 1.0888, "step": 42 }, { "epoch": 0.0455026455026455, "grad_norm": 0.6401594584722508, "learning_rate": 7.394366197183099e-06, "loss": 0.9013, "step": 43 }, { "epoch": 0.04656084656084656, "grad_norm": 0.5484291413984935, "learning_rate": 7.5704225352112675e-06, "loss": 1.0244, "step": 44 }, { "epoch": 0.047619047619047616, "grad_norm": 0.5552253491849386, "learning_rate": 7.746478873239436e-06, "loss": 0.9532, "step": 45 }, { "epoch": 0.04867724867724868, "grad_norm": 0.5652331360674767, "learning_rate": 7.922535211267606e-06, "loss": 0.9989, "step": 46 }, { "epoch": 0.04973544973544974, "grad_norm": 0.5293118764740489, "learning_rate": 8.098591549295775e-06, "loss": 0.9298, "step": 47 }, { "epoch": 0.050793650793650794, "grad_norm": 0.5285967447293929, "learning_rate": 8.274647887323944e-06, "loss": 0.9737, "step": 48 }, { "epoch": 0.05185185185185185, "grad_norm": 0.5178212820525971, "learning_rate": 8.450704225352112e-06, "loss": 0.9258, "step": 49 }, { "epoch": 0.05291005291005291, "grad_norm": 0.4733170168194918, "learning_rate": 8.626760563380283e-06, "loss": 0.9534, "step": 50 }, { "epoch": 0.05396825396825397, "grad_norm": 0.5149256098520258, "learning_rate": 8.802816901408451e-06, "loss": 1.0816, "step": 51 }, { "epoch": 0.05502645502645503, "grad_norm": 0.4685693575510743, "learning_rate": 8.978873239436621e-06, "loss": 1.027, "step": 52 }, { "epoch": 0.056084656084656084, "grad_norm": 0.39387712556840015, "learning_rate": 9.15492957746479e-06, "loss": 0.8691, "step": 53 }, { "epoch": 0.05714285714285714, "grad_norm": 0.420846981768801, "learning_rate": 9.330985915492959e-06, "loss": 0.8751, "step": 54 }, { "epoch": 0.0582010582010582, "grad_norm": 0.3985823470678549, "learning_rate": 9.507042253521127e-06, "loss": 0.7835, "step": 55 }, { "epoch": 0.05925925925925926, "grad_norm": 0.4015984425355995, "learning_rate": 9.683098591549296e-06, "loss": 0.8231, "step": 56 }, { "epoch": 0.06031746031746032, "grad_norm": 0.41302581392299265, "learning_rate": 9.859154929577465e-06, "loss": 0.9619, "step": 57 }, { "epoch": 0.061375661375661375, "grad_norm": 0.3973364065159126, "learning_rate": 1.0035211267605635e-05, "loss": 0.8624, "step": 58 }, { "epoch": 0.06243386243386243, "grad_norm": 0.3600376955888632, "learning_rate": 1.0211267605633803e-05, "loss": 0.9197, "step": 59 }, { "epoch": 0.06349206349206349, "grad_norm": 0.38943169766010316, "learning_rate": 1.0387323943661972e-05, "loss": 0.7723, "step": 60 }, { "epoch": 0.06455026455026455, "grad_norm": 0.40177295774598387, "learning_rate": 1.056338028169014e-05, "loss": 0.8586, "step": 61 }, { "epoch": 0.0656084656084656, "grad_norm": 0.36974015962677353, "learning_rate": 1.073943661971831e-05, "loss": 0.8905, "step": 62 }, { "epoch": 0.06666666666666667, "grad_norm": 0.3669768884030218, "learning_rate": 1.0915492957746478e-05, "loss": 0.8458, "step": 63 }, { "epoch": 0.06772486772486773, "grad_norm": 0.4194352772922352, "learning_rate": 1.1091549295774648e-05, "loss": 0.92, "step": 64 }, { "epoch": 0.06878306878306878, "grad_norm": 0.33216123239758316, "learning_rate": 1.1267605633802817e-05, "loss": 0.7894, "step": 65 }, { "epoch": 0.06984126984126984, "grad_norm": 0.32639923159440226, "learning_rate": 1.1443661971830987e-05, "loss": 0.8095, "step": 66 }, { "epoch": 0.07089947089947089, "grad_norm": 0.3829927205982677, "learning_rate": 1.1619718309859156e-05, "loss": 0.8557, "step": 67 }, { "epoch": 0.07195767195767196, "grad_norm": 0.3222649310951342, "learning_rate": 1.1795774647887324e-05, "loss": 0.8743, "step": 68 }, { "epoch": 0.07301587301587302, "grad_norm": 0.30020466526689116, "learning_rate": 1.1971830985915493e-05, "loss": 0.7349, "step": 69 }, { "epoch": 0.07407407407407407, "grad_norm": 0.3519372592943403, "learning_rate": 1.2147887323943663e-05, "loss": 0.7388, "step": 70 }, { "epoch": 0.07513227513227513, "grad_norm": 0.318183168764709, "learning_rate": 1.2323943661971832e-05, "loss": 0.7519, "step": 71 }, { "epoch": 0.0761904761904762, "grad_norm": 0.310753241584739, "learning_rate": 1.25e-05, "loss": 0.7459, "step": 72 }, { "epoch": 0.07724867724867725, "grad_norm": 0.3517552226109988, "learning_rate": 1.267605633802817e-05, "loss": 0.9719, "step": 73 }, { "epoch": 0.07830687830687831, "grad_norm": 0.3213372230642392, "learning_rate": 1.2852112676056338e-05, "loss": 0.7786, "step": 74 }, { "epoch": 0.07936507936507936, "grad_norm": 1.014002688907997, "learning_rate": 1.3028169014084506e-05, "loss": 0.8212, "step": 75 }, { "epoch": 0.08042328042328042, "grad_norm": 0.2970169734383055, "learning_rate": 1.3204225352112675e-05, "loss": 0.8132, "step": 76 }, { "epoch": 0.08148148148148149, "grad_norm": 0.305476164829128, "learning_rate": 1.3380281690140845e-05, "loss": 0.792, "step": 77 }, { "epoch": 0.08253968253968254, "grad_norm": 0.30966098583703827, "learning_rate": 1.3556338028169016e-05, "loss": 0.7983, "step": 78 }, { "epoch": 0.0835978835978836, "grad_norm": 0.35996391292327967, "learning_rate": 1.3732394366197184e-05, "loss": 0.8742, "step": 79 }, { "epoch": 0.08465608465608465, "grad_norm": 0.30968125317223627, "learning_rate": 1.3908450704225353e-05, "loss": 0.748, "step": 80 }, { "epoch": 0.08571428571428572, "grad_norm": 0.35611081185640775, "learning_rate": 1.4084507042253523e-05, "loss": 0.853, "step": 81 }, { "epoch": 0.08677248677248678, "grad_norm": 0.2966375377032811, "learning_rate": 1.4260563380281692e-05, "loss": 0.7151, "step": 82 }, { "epoch": 0.08783068783068783, "grad_norm": 0.7068384588765272, "learning_rate": 1.443661971830986e-05, "loss": 0.7145, "step": 83 }, { "epoch": 0.08888888888888889, "grad_norm": 0.356149749502437, "learning_rate": 1.4612676056338029e-05, "loss": 0.7752, "step": 84 }, { "epoch": 0.08994708994708994, "grad_norm": 0.30485993113060483, "learning_rate": 1.4788732394366198e-05, "loss": 0.6923, "step": 85 }, { "epoch": 0.091005291005291, "grad_norm": 0.3283908018328014, "learning_rate": 1.4964788732394366e-05, "loss": 0.8052, "step": 86 }, { "epoch": 0.09206349206349207, "grad_norm": 0.3272519816451936, "learning_rate": 1.5140845070422535e-05, "loss": 0.6502, "step": 87 }, { "epoch": 0.09312169312169312, "grad_norm": 0.3540380713478706, "learning_rate": 1.5316901408450704e-05, "loss": 0.8252, "step": 88 }, { "epoch": 0.09417989417989418, "grad_norm": 0.32708774961615394, "learning_rate": 1.5492957746478872e-05, "loss": 0.8256, "step": 89 }, { "epoch": 0.09523809523809523, "grad_norm": 0.3078761207400959, "learning_rate": 1.5669014084507044e-05, "loss": 0.7787, "step": 90 }, { "epoch": 0.0962962962962963, "grad_norm": 0.3085651879842953, "learning_rate": 1.5845070422535213e-05, "loss": 0.7873, "step": 91 }, { "epoch": 0.09735449735449736, "grad_norm": 0.3411588858236989, "learning_rate": 1.602112676056338e-05, "loss": 0.7641, "step": 92 }, { "epoch": 0.09841269841269841, "grad_norm": 0.30194570638829077, "learning_rate": 1.619718309859155e-05, "loss": 0.6853, "step": 93 }, { "epoch": 0.09947089947089947, "grad_norm": 0.3417269654477643, "learning_rate": 1.637323943661972e-05, "loss": 0.7916, "step": 94 }, { "epoch": 0.10052910052910052, "grad_norm": 0.31589650880434206, "learning_rate": 1.6549295774647887e-05, "loss": 0.7737, "step": 95 }, { "epoch": 0.10158730158730159, "grad_norm": 0.31367036126327846, "learning_rate": 1.6725352112676056e-05, "loss": 0.7542, "step": 96 }, { "epoch": 0.10264550264550265, "grad_norm": 0.31057819203477377, "learning_rate": 1.6901408450704224e-05, "loss": 0.6593, "step": 97 }, { "epoch": 0.1037037037037037, "grad_norm": 0.34793873952604554, "learning_rate": 1.7077464788732393e-05, "loss": 0.8028, "step": 98 }, { "epoch": 0.10476190476190476, "grad_norm": 0.3480073285637551, "learning_rate": 1.7253521126760565e-05, "loss": 0.7468, "step": 99 }, { "epoch": 0.10582010582010581, "grad_norm": 0.28880176690327597, "learning_rate": 1.7429577464788734e-05, "loss": 0.692, "step": 100 }, { "epoch": 0.10687830687830688, "grad_norm": 0.3138056785283419, "learning_rate": 1.7605633802816902e-05, "loss": 0.6864, "step": 101 }, { "epoch": 0.10793650793650794, "grad_norm": 0.32366719651656767, "learning_rate": 1.778169014084507e-05, "loss": 0.7089, "step": 102 }, { "epoch": 0.10899470899470899, "grad_norm": 0.33175345049430516, "learning_rate": 1.7957746478873243e-05, "loss": 0.8091, "step": 103 }, { "epoch": 0.11005291005291006, "grad_norm": 0.3133523678106677, "learning_rate": 1.813380281690141e-05, "loss": 0.7224, "step": 104 }, { "epoch": 0.1111111111111111, "grad_norm": 0.3061911187313938, "learning_rate": 1.830985915492958e-05, "loss": 0.7376, "step": 105 }, { "epoch": 0.11216931216931217, "grad_norm": 0.3006228205551271, "learning_rate": 1.848591549295775e-05, "loss": 0.7126, "step": 106 }, { "epoch": 0.11322751322751323, "grad_norm": 0.30813453357630666, "learning_rate": 1.8661971830985917e-05, "loss": 0.6727, "step": 107 }, { "epoch": 0.11428571428571428, "grad_norm": 0.2877128391886516, "learning_rate": 1.8838028169014086e-05, "loss": 0.661, "step": 108 }, { "epoch": 0.11534391534391535, "grad_norm": 0.3638361745005654, "learning_rate": 1.9014084507042255e-05, "loss": 0.7072, "step": 109 }, { "epoch": 0.1164021164021164, "grad_norm": 0.30920630591593606, "learning_rate": 1.9190140845070423e-05, "loss": 0.6213, "step": 110 }, { "epoch": 0.11746031746031746, "grad_norm": 0.5492672965501992, "learning_rate": 1.9366197183098592e-05, "loss": 0.7353, "step": 111 }, { "epoch": 0.11851851851851852, "grad_norm": 0.3885248761167444, "learning_rate": 1.954225352112676e-05, "loss": 0.8189, "step": 112 }, { "epoch": 0.11957671957671957, "grad_norm": 0.31766936558876047, "learning_rate": 1.971830985915493e-05, "loss": 0.7251, "step": 113 }, { "epoch": 0.12063492063492064, "grad_norm": 0.3878825328465268, "learning_rate": 1.98943661971831e-05, "loss": 0.7747, "step": 114 }, { "epoch": 0.12169312169312169, "grad_norm": 0.3260125841556687, "learning_rate": 2.007042253521127e-05, "loss": 0.6867, "step": 115 }, { "epoch": 0.12275132275132275, "grad_norm": 0.3582626153310118, "learning_rate": 2.024647887323944e-05, "loss": 0.6981, "step": 116 }, { "epoch": 0.12380952380952381, "grad_norm": 0.3585469784947143, "learning_rate": 2.0422535211267607e-05, "loss": 0.7825, "step": 117 }, { "epoch": 0.12486772486772486, "grad_norm": 0.3442159652247866, "learning_rate": 2.0598591549295776e-05, "loss": 0.7701, "step": 118 }, { "epoch": 0.1259259259259259, "grad_norm": 0.3403066620592321, "learning_rate": 2.0774647887323944e-05, "loss": 0.6793, "step": 119 }, { "epoch": 0.12698412698412698, "grad_norm": 0.33853441823411223, "learning_rate": 2.0950704225352113e-05, "loss": 0.6876, "step": 120 }, { "epoch": 0.12804232804232804, "grad_norm": 0.3153321491578789, "learning_rate": 2.112676056338028e-05, "loss": 0.5972, "step": 121 }, { "epoch": 0.1291005291005291, "grad_norm": 0.41035577764059683, "learning_rate": 2.130281690140845e-05, "loss": 0.7687, "step": 122 }, { "epoch": 0.13015873015873017, "grad_norm": 0.34224845150721145, "learning_rate": 2.147887323943662e-05, "loss": 0.6678, "step": 123 }, { "epoch": 0.1312169312169312, "grad_norm": 0.37838653308297, "learning_rate": 2.1654929577464787e-05, "loss": 0.664, "step": 124 }, { "epoch": 0.13227513227513227, "grad_norm": 0.356256058295494, "learning_rate": 2.1830985915492956e-05, "loss": 0.7416, "step": 125 }, { "epoch": 0.13333333333333333, "grad_norm": 0.3670688477705132, "learning_rate": 2.2007042253521128e-05, "loss": 0.6603, "step": 126 }, { "epoch": 0.1343915343915344, "grad_norm": 0.3696864614307807, "learning_rate": 2.2183098591549297e-05, "loss": 0.6723, "step": 127 }, { "epoch": 0.13544973544973546, "grad_norm": 0.305168106942622, "learning_rate": 2.2359154929577465e-05, "loss": 0.6295, "step": 128 }, { "epoch": 0.1365079365079365, "grad_norm": 1.5741882840916825, "learning_rate": 2.2535211267605634e-05, "loss": 0.738, "step": 129 }, { "epoch": 0.13756613756613756, "grad_norm": 0.40676802977095083, "learning_rate": 2.2711267605633806e-05, "loss": 0.6888, "step": 130 }, { "epoch": 0.13862433862433862, "grad_norm": 0.40640312190176336, "learning_rate": 2.2887323943661974e-05, "loss": 0.8524, "step": 131 }, { "epoch": 0.13968253968253969, "grad_norm": 0.3996709103146616, "learning_rate": 2.3063380281690143e-05, "loss": 0.6707, "step": 132 }, { "epoch": 0.14074074074074075, "grad_norm": 0.3626239173606667, "learning_rate": 2.323943661971831e-05, "loss": 0.7601, "step": 133 }, { "epoch": 0.14179894179894179, "grad_norm": 0.3889512118071032, "learning_rate": 2.341549295774648e-05, "loss": 0.6534, "step": 134 }, { "epoch": 0.14285714285714285, "grad_norm": 0.439542393481695, "learning_rate": 2.359154929577465e-05, "loss": 0.7289, "step": 135 }, { "epoch": 0.1439153439153439, "grad_norm": 0.36264558696354626, "learning_rate": 2.3767605633802817e-05, "loss": 0.6388, "step": 136 }, { "epoch": 0.14497354497354498, "grad_norm": 0.3630801750617645, "learning_rate": 2.3943661971830986e-05, "loss": 0.7319, "step": 137 }, { "epoch": 0.14603174603174604, "grad_norm": 0.32798334573994153, "learning_rate": 2.4119718309859158e-05, "loss": 0.7374, "step": 138 }, { "epoch": 0.14708994708994708, "grad_norm": 0.3595935453918023, "learning_rate": 2.4295774647887327e-05, "loss": 0.6395, "step": 139 }, { "epoch": 0.14814814814814814, "grad_norm": 0.40173981052190955, "learning_rate": 2.4471830985915495e-05, "loss": 0.6469, "step": 140 }, { "epoch": 0.1492063492063492, "grad_norm": 0.3567439766311727, "learning_rate": 2.4647887323943664e-05, "loss": 0.641, "step": 141 }, { "epoch": 0.15026455026455027, "grad_norm": 0.44153899522902645, "learning_rate": 2.4823943661971833e-05, "loss": 0.6945, "step": 142 }, { "epoch": 0.15132275132275133, "grad_norm": 0.41760121811058754, "learning_rate": 2.5e-05, "loss": 0.6683, "step": 143 }, { "epoch": 0.1523809523809524, "grad_norm": 0.3532367100024593, "learning_rate": 2.517605633802817e-05, "loss": 0.6531, "step": 144 }, { "epoch": 0.15343915343915343, "grad_norm": 0.36386964811751354, "learning_rate": 2.535211267605634e-05, "loss": 0.7372, "step": 145 }, { "epoch": 0.1544973544973545, "grad_norm": 0.3667487681947238, "learning_rate": 2.5528169014084507e-05, "loss": 0.6541, "step": 146 }, { "epoch": 0.15555555555555556, "grad_norm": 0.32814632317286685, "learning_rate": 2.5704225352112676e-05, "loss": 0.7053, "step": 147 }, { "epoch": 0.15661375661375662, "grad_norm": 0.36213420149359243, "learning_rate": 2.5880281690140844e-05, "loss": 0.7516, "step": 148 }, { "epoch": 0.15767195767195769, "grad_norm": 0.3388384477751442, "learning_rate": 2.6056338028169013e-05, "loss": 0.6326, "step": 149 }, { "epoch": 0.15873015873015872, "grad_norm": 0.3739254654391917, "learning_rate": 2.623239436619718e-05, "loss": 0.6875, "step": 150 }, { "epoch": 0.15978835978835979, "grad_norm": 0.37488629646986044, "learning_rate": 2.640845070422535e-05, "loss": 0.6842, "step": 151 }, { "epoch": 0.16084656084656085, "grad_norm": 0.3315805811522645, "learning_rate": 2.658450704225352e-05, "loss": 0.7401, "step": 152 }, { "epoch": 0.1619047619047619, "grad_norm": 0.32245214995851174, "learning_rate": 2.676056338028169e-05, "loss": 0.6606, "step": 153 }, { "epoch": 0.16296296296296298, "grad_norm": 0.29179792377833336, "learning_rate": 2.693661971830986e-05, "loss": 0.6626, "step": 154 }, { "epoch": 0.164021164021164, "grad_norm": 0.3645068699975821, "learning_rate": 2.711267605633803e-05, "loss": 0.7877, "step": 155 }, { "epoch": 0.16507936507936508, "grad_norm": 0.3120322106769627, "learning_rate": 2.72887323943662e-05, "loss": 0.6277, "step": 156 }, { "epoch": 0.16613756613756614, "grad_norm": 0.31281154861735205, "learning_rate": 2.746478873239437e-05, "loss": 0.6771, "step": 157 }, { "epoch": 0.1671957671957672, "grad_norm": 0.379934869718354, "learning_rate": 2.7640845070422537e-05, "loss": 0.6179, "step": 158 }, { "epoch": 0.16825396825396827, "grad_norm": 0.30999752344647463, "learning_rate": 2.7816901408450706e-05, "loss": 0.6421, "step": 159 }, { "epoch": 0.1693121693121693, "grad_norm": 0.3743307355482593, "learning_rate": 2.7992957746478874e-05, "loss": 0.795, "step": 160 }, { "epoch": 0.17037037037037037, "grad_norm": 0.38423928807562113, "learning_rate": 2.8169014084507046e-05, "loss": 0.7782, "step": 161 }, { "epoch": 0.17142857142857143, "grad_norm": 0.4125835959551516, "learning_rate": 2.8345070422535215e-05, "loss": 0.6243, "step": 162 }, { "epoch": 0.1724867724867725, "grad_norm": 0.33067536279547244, "learning_rate": 2.8521126760563384e-05, "loss": 0.6739, "step": 163 }, { "epoch": 0.17354497354497356, "grad_norm": 0.39452863321092113, "learning_rate": 2.8697183098591552e-05, "loss": 0.6706, "step": 164 }, { "epoch": 0.1746031746031746, "grad_norm": 0.33341809821413604, "learning_rate": 2.887323943661972e-05, "loss": 0.6624, "step": 165 }, { "epoch": 0.17566137566137566, "grad_norm": 0.3999598309398766, "learning_rate": 2.904929577464789e-05, "loss": 0.7741, "step": 166 }, { "epoch": 0.17671957671957672, "grad_norm": 0.348362123360907, "learning_rate": 2.9225352112676058e-05, "loss": 0.6416, "step": 167 }, { "epoch": 0.17777777777777778, "grad_norm": 0.38051950011821134, "learning_rate": 2.9401408450704227e-05, "loss": 0.7498, "step": 168 }, { "epoch": 0.17883597883597885, "grad_norm": 0.3398083012362421, "learning_rate": 2.9577464788732395e-05, "loss": 0.6188, "step": 169 }, { "epoch": 0.17989417989417988, "grad_norm": 0.3263197338387835, "learning_rate": 2.9753521126760564e-05, "loss": 0.6523, "step": 170 }, { "epoch": 0.18095238095238095, "grad_norm": 0.29768829451839607, "learning_rate": 2.9929577464788733e-05, "loss": 0.5983, "step": 171 }, { "epoch": 0.182010582010582, "grad_norm": 0.3616844593383965, "learning_rate": 3.01056338028169e-05, "loss": 0.6778, "step": 172 }, { "epoch": 0.18306878306878308, "grad_norm": 0.32445494168670697, "learning_rate": 3.028169014084507e-05, "loss": 0.5868, "step": 173 }, { "epoch": 0.18412698412698414, "grad_norm": 0.3744637982463626, "learning_rate": 3.045774647887324e-05, "loss": 0.5966, "step": 174 }, { "epoch": 0.18518518518518517, "grad_norm": 0.34514886886805735, "learning_rate": 3.063380281690141e-05, "loss": 0.735, "step": 175 }, { "epoch": 0.18624338624338624, "grad_norm": 0.3614006619260275, "learning_rate": 3.0809859154929576e-05, "loss": 0.6083, "step": 176 }, { "epoch": 0.1873015873015873, "grad_norm": 0.34851465059010306, "learning_rate": 3.0985915492957744e-05, "loss": 0.6449, "step": 177 }, { "epoch": 0.18835978835978837, "grad_norm": 0.3083459665743903, "learning_rate": 3.116197183098591e-05, "loss": 0.6277, "step": 178 }, { "epoch": 0.18941798941798943, "grad_norm": 0.34720965786750074, "learning_rate": 3.133802816901409e-05, "loss": 0.6329, "step": 179 }, { "epoch": 0.19047619047619047, "grad_norm": 0.358285604869434, "learning_rate": 3.151408450704226e-05, "loss": 0.6505, "step": 180 }, { "epoch": 0.19153439153439153, "grad_norm": 0.34474320283746784, "learning_rate": 3.1690140845070426e-05, "loss": 0.6319, "step": 181 }, { "epoch": 0.1925925925925926, "grad_norm": 0.3895257488548781, "learning_rate": 3.1866197183098594e-05, "loss": 0.8262, "step": 182 }, { "epoch": 0.19365079365079366, "grad_norm": 0.3457383269254632, "learning_rate": 3.204225352112676e-05, "loss": 0.6699, "step": 183 }, { "epoch": 0.19470899470899472, "grad_norm": 0.33258526228041074, "learning_rate": 3.221830985915493e-05, "loss": 0.6082, "step": 184 }, { "epoch": 0.19576719576719576, "grad_norm": 0.352563011531979, "learning_rate": 3.23943661971831e-05, "loss": 0.6628, "step": 185 }, { "epoch": 0.19682539682539682, "grad_norm": 0.3320170771366527, "learning_rate": 3.257042253521127e-05, "loss": 0.6525, "step": 186 }, { "epoch": 0.19788359788359788, "grad_norm": 0.32318375497210705, "learning_rate": 3.274647887323944e-05, "loss": 0.5735, "step": 187 }, { "epoch": 0.19894179894179895, "grad_norm": 0.3270203777340338, "learning_rate": 3.2922535211267606e-05, "loss": 0.5571, "step": 188 }, { "epoch": 0.2, "grad_norm": 0.3469279421040238, "learning_rate": 3.3098591549295775e-05, "loss": 0.6787, "step": 189 }, { "epoch": 0.20105820105820105, "grad_norm": 0.36765912576607546, "learning_rate": 3.327464788732394e-05, "loss": 0.7363, "step": 190 }, { "epoch": 0.2021164021164021, "grad_norm": 0.34672109468153317, "learning_rate": 3.345070422535211e-05, "loss": 0.637, "step": 191 }, { "epoch": 0.20317460317460317, "grad_norm": 0.38433667329288207, "learning_rate": 3.362676056338028e-05, "loss": 0.5852, "step": 192 }, { "epoch": 0.20423280423280424, "grad_norm": 0.38599947669557544, "learning_rate": 3.380281690140845e-05, "loss": 0.6301, "step": 193 }, { "epoch": 0.2052910052910053, "grad_norm": 0.40046349554165817, "learning_rate": 3.397887323943662e-05, "loss": 0.6748, "step": 194 }, { "epoch": 0.20634920634920634, "grad_norm": 0.3979623426535507, "learning_rate": 3.4154929577464786e-05, "loss": 0.671, "step": 195 }, { "epoch": 0.2074074074074074, "grad_norm": 0.40709279738188864, "learning_rate": 3.4330985915492955e-05, "loss": 0.6654, "step": 196 }, { "epoch": 0.20846560846560847, "grad_norm": 0.36359801057653723, "learning_rate": 3.450704225352113e-05, "loss": 0.6099, "step": 197 }, { "epoch": 0.20952380952380953, "grad_norm": 0.38002929487320863, "learning_rate": 3.46830985915493e-05, "loss": 0.7158, "step": 198 }, { "epoch": 0.2105820105820106, "grad_norm": 0.34947905555847264, "learning_rate": 3.485915492957747e-05, "loss": 0.5947, "step": 199 }, { "epoch": 0.21164021164021163, "grad_norm": 0.3671022867710289, "learning_rate": 3.5035211267605636e-05, "loss": 0.5809, "step": 200 }, { "epoch": 0.2126984126984127, "grad_norm": 0.4001809161609547, "learning_rate": 3.5211267605633805e-05, "loss": 0.6605, "step": 201 }, { "epoch": 0.21375661375661376, "grad_norm": 0.39279284072815546, "learning_rate": 3.538732394366197e-05, "loss": 0.6068, "step": 202 }, { "epoch": 0.21481481481481482, "grad_norm": 0.33271126101318277, "learning_rate": 3.556338028169014e-05, "loss": 0.5607, "step": 203 }, { "epoch": 0.21587301587301588, "grad_norm": 0.443314082948306, "learning_rate": 3.573943661971831e-05, "loss": 0.6927, "step": 204 }, { "epoch": 0.21693121693121692, "grad_norm": 0.31502780054757173, "learning_rate": 3.5915492957746486e-05, "loss": 0.6896, "step": 205 }, { "epoch": 0.21798941798941798, "grad_norm": 0.38688454724207, "learning_rate": 3.6091549295774655e-05, "loss": 0.604, "step": 206 }, { "epoch": 0.21904761904761905, "grad_norm": 0.3312260584034657, "learning_rate": 3.626760563380282e-05, "loss": 0.6305, "step": 207 }, { "epoch": 0.2201058201058201, "grad_norm": 0.34099970700466303, "learning_rate": 3.644366197183099e-05, "loss": 0.6235, "step": 208 }, { "epoch": 0.22116402116402117, "grad_norm": 0.35741901452146296, "learning_rate": 3.661971830985916e-05, "loss": 0.7602, "step": 209 }, { "epoch": 0.2222222222222222, "grad_norm": 0.3650578278726511, "learning_rate": 3.679577464788733e-05, "loss": 0.6775, "step": 210 }, { "epoch": 0.22328042328042327, "grad_norm": 0.32198145647198445, "learning_rate": 3.69718309859155e-05, "loss": 0.6514, "step": 211 }, { "epoch": 0.22433862433862434, "grad_norm": 0.34345547766950246, "learning_rate": 3.7147887323943666e-05, "loss": 0.5875, "step": 212 }, { "epoch": 0.2253968253968254, "grad_norm": 0.274703336883991, "learning_rate": 3.7323943661971835e-05, "loss": 0.5498, "step": 213 }, { "epoch": 0.22645502645502646, "grad_norm": 0.34787089300375457, "learning_rate": 3.7500000000000003e-05, "loss": 0.634, "step": 214 }, { "epoch": 0.2275132275132275, "grad_norm": 0.32858837285549813, "learning_rate": 3.767605633802817e-05, "loss": 0.7155, "step": 215 }, { "epoch": 0.22857142857142856, "grad_norm": 0.3549541250892209, "learning_rate": 3.785211267605634e-05, "loss": 0.658, "step": 216 }, { "epoch": 0.22962962962962963, "grad_norm": 0.3726458256752988, "learning_rate": 3.802816901408451e-05, "loss": 0.6604, "step": 217 }, { "epoch": 0.2306878306878307, "grad_norm": 0.3698559943959681, "learning_rate": 3.820422535211268e-05, "loss": 0.7083, "step": 218 }, { "epoch": 0.23174603174603176, "grad_norm": 0.3461402136268809, "learning_rate": 3.8380281690140847e-05, "loss": 0.6028, "step": 219 }, { "epoch": 0.2328042328042328, "grad_norm": 0.36882537497597867, "learning_rate": 3.8556338028169015e-05, "loss": 0.5933, "step": 220 }, { "epoch": 0.23386243386243386, "grad_norm": 0.32269501706477743, "learning_rate": 3.8732394366197184e-05, "loss": 0.5564, "step": 221 }, { "epoch": 0.23492063492063492, "grad_norm": 0.40676474989986633, "learning_rate": 3.890845070422535e-05, "loss": 0.5672, "step": 222 }, { "epoch": 0.23597883597883598, "grad_norm": 0.31466605195623004, "learning_rate": 3.908450704225352e-05, "loss": 0.6466, "step": 223 }, { "epoch": 0.23703703703703705, "grad_norm": 0.4176582469387856, "learning_rate": 3.926056338028169e-05, "loss": 0.588, "step": 224 }, { "epoch": 0.23809523809523808, "grad_norm": 0.3300399938594373, "learning_rate": 3.943661971830986e-05, "loss": 0.653, "step": 225 }, { "epoch": 0.23915343915343915, "grad_norm": 0.37714569544063986, "learning_rate": 3.9612676056338034e-05, "loss": 0.5, "step": 226 }, { "epoch": 0.2402116402116402, "grad_norm": 0.40552770928374765, "learning_rate": 3.97887323943662e-05, "loss": 0.6434, "step": 227 }, { "epoch": 0.24126984126984127, "grad_norm": 0.4636367646766144, "learning_rate": 3.996478873239437e-05, "loss": 0.675, "step": 228 }, { "epoch": 0.24232804232804234, "grad_norm": 0.4172347857566623, "learning_rate": 4.014084507042254e-05, "loss": 0.6065, "step": 229 }, { "epoch": 0.24338624338624337, "grad_norm": 0.5288156764553862, "learning_rate": 4.031690140845071e-05, "loss": 0.6875, "step": 230 }, { "epoch": 0.24444444444444444, "grad_norm": 0.36944896552596407, "learning_rate": 4.049295774647888e-05, "loss": 0.6154, "step": 231 }, { "epoch": 0.2455026455026455, "grad_norm": 0.4691395217338354, "learning_rate": 4.0669014084507045e-05, "loss": 0.7015, "step": 232 }, { "epoch": 0.24656084656084656, "grad_norm": 0.35238179145757065, "learning_rate": 4.0845070422535214e-05, "loss": 0.6848, "step": 233 }, { "epoch": 0.24761904761904763, "grad_norm": 0.483662014218935, "learning_rate": 4.102112676056338e-05, "loss": 0.6265, "step": 234 }, { "epoch": 0.24867724867724866, "grad_norm": 0.3765557351707901, "learning_rate": 4.119718309859155e-05, "loss": 0.685, "step": 235 }, { "epoch": 0.24973544973544973, "grad_norm": 0.43161674678571094, "learning_rate": 4.137323943661972e-05, "loss": 0.5812, "step": 236 }, { "epoch": 0.2507936507936508, "grad_norm": 0.3554164355775738, "learning_rate": 4.154929577464789e-05, "loss": 0.5958, "step": 237 }, { "epoch": 0.2518518518518518, "grad_norm": 0.48752122898965977, "learning_rate": 4.172535211267606e-05, "loss": 0.6589, "step": 238 }, { "epoch": 0.2529100529100529, "grad_norm": 0.41293508091850323, "learning_rate": 4.1901408450704226e-05, "loss": 0.6455, "step": 239 }, { "epoch": 0.25396825396825395, "grad_norm": 0.4459574033198097, "learning_rate": 4.2077464788732394e-05, "loss": 0.696, "step": 240 }, { "epoch": 0.25502645502645505, "grad_norm": 0.5261242261127165, "learning_rate": 4.225352112676056e-05, "loss": 0.612, "step": 241 }, { "epoch": 0.2560846560846561, "grad_norm": 0.4137102192090692, "learning_rate": 4.242957746478873e-05, "loss": 0.5578, "step": 242 }, { "epoch": 0.2571428571428571, "grad_norm": 0.570279904177202, "learning_rate": 4.26056338028169e-05, "loss": 0.6889, "step": 243 }, { "epoch": 0.2582010582010582, "grad_norm": 0.5430310414509273, "learning_rate": 4.278169014084507e-05, "loss": 0.6589, "step": 244 }, { "epoch": 0.25925925925925924, "grad_norm": 0.441503758930117, "learning_rate": 4.295774647887324e-05, "loss": 0.6545, "step": 245 }, { "epoch": 0.26031746031746034, "grad_norm": 0.5384182838717864, "learning_rate": 4.3133802816901406e-05, "loss": 0.6461, "step": 246 }, { "epoch": 0.2613756613756614, "grad_norm": 0.3999205740390739, "learning_rate": 4.3309859154929575e-05, "loss": 0.589, "step": 247 }, { "epoch": 0.2624338624338624, "grad_norm": 0.4747833562137057, "learning_rate": 4.348591549295774e-05, "loss": 0.6146, "step": 248 }, { "epoch": 0.2634920634920635, "grad_norm": 0.576119057299997, "learning_rate": 4.366197183098591e-05, "loss": 0.6782, "step": 249 }, { "epoch": 0.26455026455026454, "grad_norm": 0.5322525365860533, "learning_rate": 4.383802816901409e-05, "loss": 0.5898, "step": 250 }, { "epoch": 0.2656084656084656, "grad_norm": 0.5225758045153239, "learning_rate": 4.4014084507042256e-05, "loss": 0.5973, "step": 251 }, { "epoch": 0.26666666666666666, "grad_norm": 1.1554341236749963, "learning_rate": 4.4190140845070424e-05, "loss": 0.4985, "step": 252 }, { "epoch": 0.2677248677248677, "grad_norm": 0.47852793764541357, "learning_rate": 4.436619718309859e-05, "loss": 0.5225, "step": 253 }, { "epoch": 0.2687830687830688, "grad_norm": 0.41663440733827256, "learning_rate": 4.454225352112676e-05, "loss": 0.6353, "step": 254 }, { "epoch": 0.2698412698412698, "grad_norm": 0.5546294618558951, "learning_rate": 4.471830985915493e-05, "loss": 0.6392, "step": 255 }, { "epoch": 0.2708994708994709, "grad_norm": 0.37390131385634334, "learning_rate": 4.48943661971831e-05, "loss": 0.69, "step": 256 }, { "epoch": 0.27195767195767195, "grad_norm": 1.3249897376641209, "learning_rate": 4.507042253521127e-05, "loss": 0.5528, "step": 257 }, { "epoch": 0.273015873015873, "grad_norm": 0.4769685334018541, "learning_rate": 4.5246478873239436e-05, "loss": 0.5465, "step": 258 }, { "epoch": 0.2740740740740741, "grad_norm": 0.639538521986945, "learning_rate": 4.542253521126761e-05, "loss": 0.6834, "step": 259 }, { "epoch": 0.2751322751322751, "grad_norm": 0.5219205657799278, "learning_rate": 4.559859154929578e-05, "loss": 0.5925, "step": 260 }, { "epoch": 0.2761904761904762, "grad_norm": 0.47303904730740937, "learning_rate": 4.577464788732395e-05, "loss": 0.6291, "step": 261 }, { "epoch": 0.27724867724867724, "grad_norm": 0.4124587679203123, "learning_rate": 4.595070422535212e-05, "loss": 0.5333, "step": 262 }, { "epoch": 0.2783068783068783, "grad_norm": 0.39574829022247976, "learning_rate": 4.6126760563380286e-05, "loss": 0.6118, "step": 263 }, { "epoch": 0.27936507936507937, "grad_norm": 0.4906299930682528, "learning_rate": 4.6302816901408455e-05, "loss": 0.5492, "step": 264 }, { "epoch": 0.2804232804232804, "grad_norm": 0.45541184091560766, "learning_rate": 4.647887323943662e-05, "loss": 0.6135, "step": 265 }, { "epoch": 0.2814814814814815, "grad_norm": 0.3569305414446637, "learning_rate": 4.665492957746479e-05, "loss": 0.5374, "step": 266 }, { "epoch": 0.28253968253968254, "grad_norm": 0.43109469741112555, "learning_rate": 4.683098591549296e-05, "loss": 0.6969, "step": 267 }, { "epoch": 0.28359788359788357, "grad_norm": 0.34850457997668804, "learning_rate": 4.700704225352113e-05, "loss": 0.5812, "step": 268 }, { "epoch": 0.28465608465608466, "grad_norm": 0.43017148890786167, "learning_rate": 4.71830985915493e-05, "loss": 0.6669, "step": 269 }, { "epoch": 0.2857142857142857, "grad_norm": 0.3979891127651658, "learning_rate": 4.7359154929577466e-05, "loss": 0.5677, "step": 270 }, { "epoch": 0.2867724867724868, "grad_norm": 0.5535072160058837, "learning_rate": 4.7535211267605635e-05, "loss": 0.6401, "step": 271 }, { "epoch": 0.2878306878306878, "grad_norm": 0.4788371326408822, "learning_rate": 4.7711267605633804e-05, "loss": 0.6775, "step": 272 }, { "epoch": 0.28888888888888886, "grad_norm": 0.37053522739872213, "learning_rate": 4.788732394366197e-05, "loss": 0.6266, "step": 273 }, { "epoch": 0.28994708994708995, "grad_norm": 0.44280982619784764, "learning_rate": 4.806338028169015e-05, "loss": 0.6108, "step": 274 }, { "epoch": 0.291005291005291, "grad_norm": 0.34292181255238274, "learning_rate": 4.8239436619718316e-05, "loss": 0.5563, "step": 275 }, { "epoch": 0.2920634920634921, "grad_norm": 0.4678353671213705, "learning_rate": 4.8415492957746485e-05, "loss": 0.6338, "step": 276 }, { "epoch": 0.2931216931216931, "grad_norm": 0.3603311974292169, "learning_rate": 4.8591549295774653e-05, "loss": 0.565, "step": 277 }, { "epoch": 0.29417989417989415, "grad_norm": 0.4616974029069646, "learning_rate": 4.876760563380282e-05, "loss": 0.5827, "step": 278 }, { "epoch": 0.29523809523809524, "grad_norm": 0.4417612006792997, "learning_rate": 4.894366197183099e-05, "loss": 0.7125, "step": 279 }, { "epoch": 0.2962962962962963, "grad_norm": 0.6134549574544247, "learning_rate": 4.911971830985916e-05, "loss": 0.6591, "step": 280 }, { "epoch": 0.29735449735449737, "grad_norm": 0.37720738345155475, "learning_rate": 4.929577464788733e-05, "loss": 0.5585, "step": 281 }, { "epoch": 0.2984126984126984, "grad_norm": 0.4237444553725601, "learning_rate": 4.9471830985915497e-05, "loss": 0.6572, "step": 282 }, { "epoch": 0.29947089947089944, "grad_norm": 0.36275418041559837, "learning_rate": 4.9647887323943665e-05, "loss": 0.6876, "step": 283 }, { "epoch": 0.30052910052910053, "grad_norm": 0.5213976463565436, "learning_rate": 4.9823943661971834e-05, "loss": 0.6989, "step": 284 }, { "epoch": 0.30158730158730157, "grad_norm": 0.3570263641303588, "learning_rate": 5e-05, "loss": 0.6558, "step": 285 }, { "epoch": 0.30264550264550266, "grad_norm": 0.4185098108573085, "learning_rate": 4.998039984319875e-05, "loss": 0.5007, "step": 286 }, { "epoch": 0.3037037037037037, "grad_norm": 0.3850834324831248, "learning_rate": 4.9960799686397494e-05, "loss": 0.5663, "step": 287 }, { "epoch": 0.3047619047619048, "grad_norm": 0.42577227525237116, "learning_rate": 4.994119952959624e-05, "loss": 0.6345, "step": 288 }, { "epoch": 0.3058201058201058, "grad_norm": 0.365037307220747, "learning_rate": 4.9921599372794986e-05, "loss": 0.6379, "step": 289 }, { "epoch": 0.30687830687830686, "grad_norm": 0.4118237945823454, "learning_rate": 4.990199921599373e-05, "loss": 0.634, "step": 290 }, { "epoch": 0.30793650793650795, "grad_norm": 0.33860288653166326, "learning_rate": 4.988239905919248e-05, "loss": 0.5576, "step": 291 }, { "epoch": 0.308994708994709, "grad_norm": 0.4301618112703962, "learning_rate": 4.986279890239122e-05, "loss": 0.5168, "step": 292 }, { "epoch": 0.3100529100529101, "grad_norm": 0.3153318738893075, "learning_rate": 4.984319874558997e-05, "loss": 0.5811, "step": 293 }, { "epoch": 0.3111111111111111, "grad_norm": 0.363675255321121, "learning_rate": 4.982359858878871e-05, "loss": 0.5799, "step": 294 }, { "epoch": 0.31216931216931215, "grad_norm": 0.34691937369358705, "learning_rate": 4.980399843198746e-05, "loss": 0.5285, "step": 295 }, { "epoch": 0.31322751322751324, "grad_norm": 0.36271401144515686, "learning_rate": 4.9784398275186204e-05, "loss": 0.5956, "step": 296 }, { "epoch": 0.3142857142857143, "grad_norm": 0.39889753761155844, "learning_rate": 4.9764798118384946e-05, "loss": 0.6149, "step": 297 }, { "epoch": 0.31534391534391537, "grad_norm": 1.4954635983258886, "learning_rate": 4.9745197961583695e-05, "loss": 0.6453, "step": 298 }, { "epoch": 0.3164021164021164, "grad_norm": 0.453710769398266, "learning_rate": 4.9725597804782445e-05, "loss": 0.6233, "step": 299 }, { "epoch": 0.31746031746031744, "grad_norm": 0.3324927531881442, "learning_rate": 4.970599764798119e-05, "loss": 0.5543, "step": 300 }, { "epoch": 0.31851851851851853, "grad_norm": 0.3623203424678947, "learning_rate": 4.968639749117993e-05, "loss": 0.6053, "step": 301 }, { "epoch": 0.31957671957671957, "grad_norm": 0.37974058426507107, "learning_rate": 4.966679733437868e-05, "loss": 0.6463, "step": 302 }, { "epoch": 0.32063492063492066, "grad_norm": 0.37093076432402033, "learning_rate": 4.964719717757742e-05, "loss": 0.6168, "step": 303 }, { "epoch": 0.3216931216931217, "grad_norm": 0.4056043538201469, "learning_rate": 4.962759702077617e-05, "loss": 0.7495, "step": 304 }, { "epoch": 0.32275132275132273, "grad_norm": 0.328152358631735, "learning_rate": 4.960799686397491e-05, "loss": 0.5899, "step": 305 }, { "epoch": 0.3238095238095238, "grad_norm": 0.35175655000013734, "learning_rate": 4.958839670717366e-05, "loss": 0.5318, "step": 306 }, { "epoch": 0.32486772486772486, "grad_norm": 0.3569933392052981, "learning_rate": 4.9568796550372405e-05, "loss": 0.6631, "step": 307 }, { "epoch": 0.32592592592592595, "grad_norm": 0.37456493527809115, "learning_rate": 4.9549196393571154e-05, "loss": 0.7267, "step": 308 }, { "epoch": 0.326984126984127, "grad_norm": 6.48230463042188, "learning_rate": 4.9529596236769897e-05, "loss": 0.9326, "step": 309 }, { "epoch": 0.328042328042328, "grad_norm": 0.597786693643111, "learning_rate": 4.950999607996864e-05, "loss": 0.6035, "step": 310 }, { "epoch": 0.3291005291005291, "grad_norm": 1.0072574874604938, "learning_rate": 4.949039592316739e-05, "loss": 0.6678, "step": 311 }, { "epoch": 0.33015873015873015, "grad_norm": 2.015193754844906, "learning_rate": 4.947079576636614e-05, "loss": 0.6074, "step": 312 }, { "epoch": 0.33121693121693124, "grad_norm": 0.9965912669595596, "learning_rate": 4.945119560956487e-05, "loss": 0.5941, "step": 313 }, { "epoch": 0.3322751322751323, "grad_norm": 0.6804805860670657, "learning_rate": 4.943159545276362e-05, "loss": 0.6363, "step": 314 }, { "epoch": 0.3333333333333333, "grad_norm": 0.6017533613165538, "learning_rate": 4.941199529596237e-05, "loss": 0.6276, "step": 315 }, { "epoch": 0.3343915343915344, "grad_norm": 0.704554658779125, "learning_rate": 4.939239513916112e-05, "loss": 0.6388, "step": 316 }, { "epoch": 0.33544973544973544, "grad_norm": 0.5032673111541713, "learning_rate": 4.937279498235986e-05, "loss": 0.5389, "step": 317 }, { "epoch": 0.33650793650793653, "grad_norm": 0.5015885516895849, "learning_rate": 4.9353194825558606e-05, "loss": 0.5515, "step": 318 }, { "epoch": 0.33756613756613757, "grad_norm": 0.5097653076876837, "learning_rate": 4.9333594668757355e-05, "loss": 0.5483, "step": 319 }, { "epoch": 0.3386243386243386, "grad_norm": 0.689407061137572, "learning_rate": 4.93139945119561e-05, "loss": 0.6595, "step": 320 }, { "epoch": 0.3396825396825397, "grad_norm": 1.3956902752361948, "learning_rate": 4.929439435515485e-05, "loss": 0.6863, "step": 321 }, { "epoch": 0.34074074074074073, "grad_norm": 0.6477224984591293, "learning_rate": 4.927479419835359e-05, "loss": 0.6719, "step": 322 }, { "epoch": 0.3417989417989418, "grad_norm": 0.4791685078247184, "learning_rate": 4.925519404155233e-05, "loss": 0.6532, "step": 323 }, { "epoch": 0.34285714285714286, "grad_norm": 0.7647455686764629, "learning_rate": 4.923559388475108e-05, "loss": 0.5633, "step": 324 }, { "epoch": 0.3439153439153439, "grad_norm": 0.540162982463785, "learning_rate": 4.921599372794983e-05, "loss": 0.6154, "step": 325 }, { "epoch": 0.344973544973545, "grad_norm": 0.5987926929939865, "learning_rate": 4.9196393571148566e-05, "loss": 0.5485, "step": 326 }, { "epoch": 0.346031746031746, "grad_norm": 0.47012692994553495, "learning_rate": 4.9176793414347316e-05, "loss": 0.6356, "step": 327 }, { "epoch": 0.3470899470899471, "grad_norm": 0.6190282717604556, "learning_rate": 4.9157193257546065e-05, "loss": 0.737, "step": 328 }, { "epoch": 0.34814814814814815, "grad_norm": 0.48951022131560773, "learning_rate": 4.9137593100744814e-05, "loss": 0.6171, "step": 329 }, { "epoch": 0.3492063492063492, "grad_norm": 0.5704216168233464, "learning_rate": 4.911799294394355e-05, "loss": 0.6843, "step": 330 }, { "epoch": 0.3502645502645503, "grad_norm": 0.3828305070852499, "learning_rate": 4.90983927871423e-05, "loss": 0.6277, "step": 331 }, { "epoch": 0.3513227513227513, "grad_norm": 0.4899467959752788, "learning_rate": 4.907879263034105e-05, "loss": 0.5997, "step": 332 }, { "epoch": 0.3523809523809524, "grad_norm": 0.4491699388666299, "learning_rate": 4.905919247353979e-05, "loss": 0.5925, "step": 333 }, { "epoch": 0.35343915343915344, "grad_norm": 0.425441565694508, "learning_rate": 4.903959231673853e-05, "loss": 0.5301, "step": 334 }, { "epoch": 0.3544973544973545, "grad_norm": 0.45512477300228416, "learning_rate": 4.901999215993728e-05, "loss": 0.5845, "step": 335 }, { "epoch": 0.35555555555555557, "grad_norm": 10.638459759457483, "learning_rate": 4.9000392003136025e-05, "loss": 0.7328, "step": 336 }, { "epoch": 0.3566137566137566, "grad_norm": 0.6506851562077983, "learning_rate": 4.8980791846334774e-05, "loss": 0.5733, "step": 337 }, { "epoch": 0.3576719576719577, "grad_norm": 0.3597461888788251, "learning_rate": 4.896119168953352e-05, "loss": 0.5873, "step": 338 }, { "epoch": 0.35873015873015873, "grad_norm": 0.489755609131464, "learning_rate": 4.894159153273226e-05, "loss": 0.5659, "step": 339 }, { "epoch": 0.35978835978835977, "grad_norm": 0.4683550804241493, "learning_rate": 4.892199137593101e-05, "loss": 0.62, "step": 340 }, { "epoch": 0.36084656084656086, "grad_norm": 0.5994458890657961, "learning_rate": 4.890239121912976e-05, "loss": 0.6241, "step": 341 }, { "epoch": 0.3619047619047619, "grad_norm": 0.36417066783567376, "learning_rate": 4.88827910623285e-05, "loss": 0.5937, "step": 342 }, { "epoch": 0.362962962962963, "grad_norm": 0.5283081961653806, "learning_rate": 4.886319090552724e-05, "loss": 0.6184, "step": 343 }, { "epoch": 0.364021164021164, "grad_norm": 0.39305815236656894, "learning_rate": 4.884359074872599e-05, "loss": 0.6288, "step": 344 }, { "epoch": 0.36507936507936506, "grad_norm": 0.45312073034904926, "learning_rate": 4.882399059192474e-05, "loss": 0.6075, "step": 345 }, { "epoch": 0.36613756613756615, "grad_norm": 0.4058520594663374, "learning_rate": 4.8804390435123484e-05, "loss": 0.694, "step": 346 }, { "epoch": 0.3671957671957672, "grad_norm": 0.5002449512089666, "learning_rate": 4.8784790278322226e-05, "loss": 0.5748, "step": 347 }, { "epoch": 0.3682539682539683, "grad_norm": 0.4117436134851183, "learning_rate": 4.8765190121520976e-05, "loss": 0.6376, "step": 348 }, { "epoch": 0.3693121693121693, "grad_norm": 0.502404236336507, "learning_rate": 4.874558996471972e-05, "loss": 0.6094, "step": 349 }, { "epoch": 0.37037037037037035, "grad_norm": 0.40455105073614056, "learning_rate": 4.872598980791847e-05, "loss": 0.6217, "step": 350 }, { "epoch": 0.37142857142857144, "grad_norm": 0.35704360652012157, "learning_rate": 4.870638965111721e-05, "loss": 0.5024, "step": 351 }, { "epoch": 0.3724867724867725, "grad_norm": 0.4264916243948178, "learning_rate": 4.868678949431595e-05, "loss": 0.5388, "step": 352 }, { "epoch": 0.37354497354497357, "grad_norm": 0.3327240004352966, "learning_rate": 4.86671893375147e-05, "loss": 0.5935, "step": 353 }, { "epoch": 0.3746031746031746, "grad_norm": 0.41835670277009634, "learning_rate": 4.864758918071345e-05, "loss": 0.635, "step": 354 }, { "epoch": 0.37566137566137564, "grad_norm": 0.3307152093175963, "learning_rate": 4.862798902391219e-05, "loss": 0.618, "step": 355 }, { "epoch": 0.37671957671957673, "grad_norm": 0.36724207232476874, "learning_rate": 4.8608388867110936e-05, "loss": 0.5644, "step": 356 }, { "epoch": 0.37777777777777777, "grad_norm": 0.33194305741881575, "learning_rate": 4.8588788710309685e-05, "loss": 0.5842, "step": 357 }, { "epoch": 0.37883597883597886, "grad_norm": 0.35340969186734694, "learning_rate": 4.8569188553508434e-05, "loss": 0.6276, "step": 358 }, { "epoch": 0.3798941798941799, "grad_norm": 0.3326126922758559, "learning_rate": 4.854958839670718e-05, "loss": 0.6179, "step": 359 }, { "epoch": 0.38095238095238093, "grad_norm": 0.3405811498157609, "learning_rate": 4.852998823990592e-05, "loss": 0.6401, "step": 360 }, { "epoch": 0.382010582010582, "grad_norm": 0.3226096419931992, "learning_rate": 4.851038808310467e-05, "loss": 0.605, "step": 361 }, { "epoch": 0.38306878306878306, "grad_norm": 0.32239461661768787, "learning_rate": 4.849078792630341e-05, "loss": 0.6507, "step": 362 }, { "epoch": 0.38412698412698415, "grad_norm": 0.3484471583643884, "learning_rate": 4.847118776950216e-05, "loss": 0.605, "step": 363 }, { "epoch": 0.3851851851851852, "grad_norm": 0.3112894013568265, "learning_rate": 4.84515876127009e-05, "loss": 0.5174, "step": 364 }, { "epoch": 0.3862433862433862, "grad_norm": 0.2899220938284599, "learning_rate": 4.843198745589965e-05, "loss": 0.5094, "step": 365 }, { "epoch": 0.3873015873015873, "grad_norm": 0.28917708457385394, "learning_rate": 4.8412387299098395e-05, "loss": 0.5609, "step": 366 }, { "epoch": 0.38835978835978835, "grad_norm": 0.3538788286573949, "learning_rate": 4.8392787142297144e-05, "loss": 0.6225, "step": 367 }, { "epoch": 0.38941798941798944, "grad_norm": 0.3012030881102022, "learning_rate": 4.8373186985495886e-05, "loss": 0.6549, "step": 368 }, { "epoch": 0.3904761904761905, "grad_norm": 0.3265061631766346, "learning_rate": 4.835358682869463e-05, "loss": 0.5666, "step": 369 }, { "epoch": 0.3915343915343915, "grad_norm": 0.2683583970944115, "learning_rate": 4.833398667189338e-05, "loss": 0.5693, "step": 370 }, { "epoch": 0.3925925925925926, "grad_norm": 0.3579867664845553, "learning_rate": 4.831438651509213e-05, "loss": 0.6117, "step": 371 }, { "epoch": 0.39365079365079364, "grad_norm": 0.29023429064286466, "learning_rate": 4.829478635829087e-05, "loss": 0.6243, "step": 372 }, { "epoch": 0.39470899470899473, "grad_norm": 0.30720865653544066, "learning_rate": 4.827518620148961e-05, "loss": 0.6137, "step": 373 }, { "epoch": 0.39576719576719577, "grad_norm": 0.28341688168565876, "learning_rate": 4.825558604468836e-05, "loss": 0.6066, "step": 374 }, { "epoch": 0.3968253968253968, "grad_norm": 0.27817987141750344, "learning_rate": 4.8235985887887104e-05, "loss": 0.523, "step": 375 }, { "epoch": 0.3978835978835979, "grad_norm": 0.3145876613453268, "learning_rate": 4.821638573108585e-05, "loss": 0.6418, "step": 376 }, { "epoch": 0.39894179894179893, "grad_norm": 0.3096978266574846, "learning_rate": 4.8196785574284596e-05, "loss": 0.6102, "step": 377 }, { "epoch": 0.4, "grad_norm": 0.29312324806193996, "learning_rate": 4.8177185417483345e-05, "loss": 0.6043, "step": 378 }, { "epoch": 0.40105820105820106, "grad_norm": 0.3215404710144968, "learning_rate": 4.815758526068209e-05, "loss": 0.5813, "step": 379 }, { "epoch": 0.4021164021164021, "grad_norm": 0.2843918875735979, "learning_rate": 4.813798510388084e-05, "loss": 0.6212, "step": 380 }, { "epoch": 0.4031746031746032, "grad_norm": 0.30037403275169, "learning_rate": 4.811838494707958e-05, "loss": 0.6729, "step": 381 }, { "epoch": 0.4042328042328042, "grad_norm": 0.3207760435283416, "learning_rate": 4.809878479027832e-05, "loss": 0.55, "step": 382 }, { "epoch": 0.4052910052910053, "grad_norm": 0.26503156107162107, "learning_rate": 4.807918463347707e-05, "loss": 0.5419, "step": 383 }, { "epoch": 0.40634920634920635, "grad_norm": 0.2882065071584723, "learning_rate": 4.805958447667582e-05, "loss": 0.569, "step": 384 }, { "epoch": 0.4074074074074074, "grad_norm": 0.29455047481787927, "learning_rate": 4.8039984319874556e-05, "loss": 0.5042, "step": 385 }, { "epoch": 0.4084656084656085, "grad_norm": 0.29685194902902406, "learning_rate": 4.8020384163073305e-05, "loss": 0.5383, "step": 386 }, { "epoch": 0.4095238095238095, "grad_norm": 0.376863384403299, "learning_rate": 4.8000784006272054e-05, "loss": 0.6392, "step": 387 }, { "epoch": 0.4105820105820106, "grad_norm": 0.3371594913183494, "learning_rate": 4.79811838494708e-05, "loss": 0.661, "step": 388 }, { "epoch": 0.41164021164021164, "grad_norm": 0.34032184447977354, "learning_rate": 4.796158369266954e-05, "loss": 0.5174, "step": 389 }, { "epoch": 0.4126984126984127, "grad_norm": 0.3887738951246669, "learning_rate": 4.794198353586829e-05, "loss": 0.6675, "step": 390 }, { "epoch": 0.41375661375661377, "grad_norm": 0.29671883772859686, "learning_rate": 4.792238337906704e-05, "loss": 0.5057, "step": 391 }, { "epoch": 0.4148148148148148, "grad_norm": 0.35504708481760416, "learning_rate": 4.790278322226578e-05, "loss": 0.5723, "step": 392 }, { "epoch": 0.4158730158730159, "grad_norm": 0.3187145224838173, "learning_rate": 4.788318306546453e-05, "loss": 0.5921, "step": 393 }, { "epoch": 0.41693121693121693, "grad_norm": 0.296017951892389, "learning_rate": 4.786358290866327e-05, "loss": 0.5281, "step": 394 }, { "epoch": 0.41798941798941797, "grad_norm": 0.39656850609570043, "learning_rate": 4.7843982751862015e-05, "loss": 0.6467, "step": 395 }, { "epoch": 0.41904761904761906, "grad_norm": 0.32082406571324745, "learning_rate": 4.7824382595060764e-05, "loss": 0.5867, "step": 396 }, { "epoch": 0.4201058201058201, "grad_norm": 0.4041020790707597, "learning_rate": 4.780478243825951e-05, "loss": 0.533, "step": 397 }, { "epoch": 0.4211640211640212, "grad_norm": 0.38193752458595226, "learning_rate": 4.778518228145825e-05, "loss": 0.5414, "step": 398 }, { "epoch": 0.4222222222222222, "grad_norm": 0.30383562921606366, "learning_rate": 4.7765582124657e-05, "loss": 0.5696, "step": 399 }, { "epoch": 0.42328042328042326, "grad_norm": 0.3922882135971307, "learning_rate": 4.774598196785575e-05, "loss": 0.5608, "step": 400 }, { "epoch": 0.42433862433862435, "grad_norm": 0.43007597049774227, "learning_rate": 4.772638181105449e-05, "loss": 0.5647, "step": 401 }, { "epoch": 0.4253968253968254, "grad_norm": 0.5686439579885741, "learning_rate": 4.770678165425323e-05, "loss": 0.4939, "step": 402 }, { "epoch": 0.4264550264550265, "grad_norm": 0.36392633625996323, "learning_rate": 4.768718149745198e-05, "loss": 0.5443, "step": 403 }, { "epoch": 0.4275132275132275, "grad_norm": 0.3272462847928772, "learning_rate": 4.766758134065073e-05, "loss": 0.5693, "step": 404 }, { "epoch": 0.42857142857142855, "grad_norm": 0.3124002758364203, "learning_rate": 4.7647981183849473e-05, "loss": 0.6362, "step": 405 }, { "epoch": 0.42962962962962964, "grad_norm": 0.29871765035322945, "learning_rate": 4.7628381027048216e-05, "loss": 0.5147, "step": 406 }, { "epoch": 0.4306878306878307, "grad_norm": 0.3962457157308558, "learning_rate": 4.7608780870246965e-05, "loss": 0.5617, "step": 407 }, { "epoch": 0.43174603174603177, "grad_norm": 0.334309491928329, "learning_rate": 4.758918071344571e-05, "loss": 0.5507, "step": 408 }, { "epoch": 0.4328042328042328, "grad_norm": 0.3494500717283481, "learning_rate": 4.756958055664446e-05, "loss": 0.5789, "step": 409 }, { "epoch": 0.43386243386243384, "grad_norm": 0.9776571014024243, "learning_rate": 4.75499803998432e-05, "loss": 0.5971, "step": 410 }, { "epoch": 0.43492063492063493, "grad_norm": 0.36178604692099614, "learning_rate": 4.753038024304194e-05, "loss": 0.5824, "step": 411 }, { "epoch": 0.43597883597883597, "grad_norm": 0.3701183639060816, "learning_rate": 4.751078008624069e-05, "loss": 0.6046, "step": 412 }, { "epoch": 0.43703703703703706, "grad_norm": 0.37912182400136274, "learning_rate": 4.749117992943944e-05, "loss": 0.5887, "step": 413 }, { "epoch": 0.4380952380952381, "grad_norm": 0.34637123340056375, "learning_rate": 4.747157977263818e-05, "loss": 0.5773, "step": 414 }, { "epoch": 0.43915343915343913, "grad_norm": 0.4073065757199422, "learning_rate": 4.7451979615836925e-05, "loss": 0.5738, "step": 415 }, { "epoch": 0.4402116402116402, "grad_norm": 0.3344005038216227, "learning_rate": 4.7432379459035675e-05, "loss": 0.5061, "step": 416 }, { "epoch": 0.44126984126984126, "grad_norm": 0.3739197194136766, "learning_rate": 4.7412779302234424e-05, "loss": 0.585, "step": 417 }, { "epoch": 0.44232804232804235, "grad_norm": 0.6438115315588397, "learning_rate": 4.7393179145433166e-05, "loss": 0.6128, "step": 418 }, { "epoch": 0.4433862433862434, "grad_norm": 0.4877567334363792, "learning_rate": 4.737357898863191e-05, "loss": 0.6998, "step": 419 }, { "epoch": 0.4444444444444444, "grad_norm": 0.33503924359280984, "learning_rate": 4.735397883183066e-05, "loss": 0.6582, "step": 420 }, { "epoch": 0.4455026455026455, "grad_norm": 0.35412692478679975, "learning_rate": 4.73343786750294e-05, "loss": 0.5902, "step": 421 }, { "epoch": 0.44656084656084655, "grad_norm": 0.32185468036018056, "learning_rate": 4.731477851822815e-05, "loss": 0.5647, "step": 422 }, { "epoch": 0.44761904761904764, "grad_norm": 0.33447079814754294, "learning_rate": 4.729517836142689e-05, "loss": 0.5391, "step": 423 }, { "epoch": 0.4486772486772487, "grad_norm": 5.115754194500417, "learning_rate": 4.7275578204625635e-05, "loss": 0.6673, "step": 424 }, { "epoch": 0.4497354497354497, "grad_norm": 1.328092342445049, "learning_rate": 4.7255978047824384e-05, "loss": 0.7066, "step": 425 }, { "epoch": 0.4507936507936508, "grad_norm": 0.40429664376112756, "learning_rate": 4.7236377891023133e-05, "loss": 0.6268, "step": 426 }, { "epoch": 0.45185185185185184, "grad_norm": 0.35636211087696995, "learning_rate": 4.7216777734221876e-05, "loss": 0.5591, "step": 427 }, { "epoch": 0.45291005291005293, "grad_norm": 0.8540612192195892, "learning_rate": 4.719717757742062e-05, "loss": 0.5389, "step": 428 }, { "epoch": 0.45396825396825397, "grad_norm": 0.40756442415791055, "learning_rate": 4.717757742061937e-05, "loss": 0.5483, "step": 429 }, { "epoch": 0.455026455026455, "grad_norm": 0.3970491214306376, "learning_rate": 4.715797726381812e-05, "loss": 0.6473, "step": 430 }, { "epoch": 0.4560846560846561, "grad_norm": 0.33077010193886147, "learning_rate": 4.713837710701686e-05, "loss": 0.5264, "step": 431 }, { "epoch": 0.45714285714285713, "grad_norm": 0.37580554982943354, "learning_rate": 4.71187769502156e-05, "loss": 0.5222, "step": 432 }, { "epoch": 0.4582010582010582, "grad_norm": 0.4135755906441715, "learning_rate": 4.709917679341435e-05, "loss": 0.5681, "step": 433 }, { "epoch": 0.45925925925925926, "grad_norm": 0.3556795364307751, "learning_rate": 4.7079576636613094e-05, "loss": 0.616, "step": 434 }, { "epoch": 0.4603174603174603, "grad_norm": 0.3255431792109049, "learning_rate": 4.705997647981184e-05, "loss": 0.6155, "step": 435 }, { "epoch": 0.4613756613756614, "grad_norm": 0.3621995346427838, "learning_rate": 4.7040376323010585e-05, "loss": 0.5851, "step": 436 }, { "epoch": 0.4624338624338624, "grad_norm": 0.31078411014873397, "learning_rate": 4.702077616620933e-05, "loss": 0.5034, "step": 437 }, { "epoch": 0.4634920634920635, "grad_norm": 0.27503733967112304, "learning_rate": 4.700117600940808e-05, "loss": 0.5851, "step": 438 }, { "epoch": 0.46455026455026455, "grad_norm": 0.34224618740297685, "learning_rate": 4.6981575852606826e-05, "loss": 0.6285, "step": 439 }, { "epoch": 0.4656084656084656, "grad_norm": 0.27425336801037914, "learning_rate": 4.696197569580557e-05, "loss": 0.5225, "step": 440 }, { "epoch": 0.4666666666666667, "grad_norm": 0.3603257769316038, "learning_rate": 4.694237553900431e-05, "loss": 0.5921, "step": 441 }, { "epoch": 0.4677248677248677, "grad_norm": 0.32429639783867814, "learning_rate": 4.692277538220306e-05, "loss": 0.6941, "step": 442 }, { "epoch": 0.4687830687830688, "grad_norm": 0.34481993099571034, "learning_rate": 4.690317522540181e-05, "loss": 0.5203, "step": 443 }, { "epoch": 0.46984126984126984, "grad_norm": 0.3193736572025401, "learning_rate": 4.688357506860055e-05, "loss": 0.6302, "step": 444 }, { "epoch": 0.4708994708994709, "grad_norm": 1.6187717657921294, "learning_rate": 4.6863974911799295e-05, "loss": 0.6413, "step": 445 }, { "epoch": 0.47195767195767196, "grad_norm": 0.45961110444236447, "learning_rate": 4.6844374754998044e-05, "loss": 0.6542, "step": 446 }, { "epoch": 0.473015873015873, "grad_norm": 0.34842912443778445, "learning_rate": 4.682477459819679e-05, "loss": 0.5353, "step": 447 }, { "epoch": 0.4740740740740741, "grad_norm": 0.6192864098610441, "learning_rate": 4.6805174441395536e-05, "loss": 0.4736, "step": 448 }, { "epoch": 0.47513227513227513, "grad_norm": 0.43463315211498527, "learning_rate": 4.678557428459428e-05, "loss": 0.6634, "step": 449 }, { "epoch": 0.47619047619047616, "grad_norm": 0.4140965336433174, "learning_rate": 4.676597412779302e-05, "loss": 0.6842, "step": 450 }, { "epoch": 0.47724867724867726, "grad_norm": 0.3907863076094101, "learning_rate": 4.674637397099177e-05, "loss": 0.5236, "step": 451 }, { "epoch": 0.4783068783068783, "grad_norm": 0.3553573532226626, "learning_rate": 4.672677381419052e-05, "loss": 0.6322, "step": 452 }, { "epoch": 0.4793650793650794, "grad_norm": 0.32855146366281446, "learning_rate": 4.670717365738926e-05, "loss": 0.5676, "step": 453 }, { "epoch": 0.4804232804232804, "grad_norm": 0.3355219316367844, "learning_rate": 4.6687573500588004e-05, "loss": 0.6183, "step": 454 }, { "epoch": 0.48148148148148145, "grad_norm": 0.33915222521124233, "learning_rate": 4.6667973343786754e-05, "loss": 0.5957, "step": 455 }, { "epoch": 0.48253968253968255, "grad_norm": 0.2999900043336735, "learning_rate": 4.66483731869855e-05, "loss": 0.6239, "step": 456 }, { "epoch": 0.4835978835978836, "grad_norm": 1.932322840844984, "learning_rate": 4.662877303018424e-05, "loss": 0.6079, "step": 457 }, { "epoch": 0.4846560846560847, "grad_norm": 0.3841930370997371, "learning_rate": 4.660917287338299e-05, "loss": 0.5339, "step": 458 }, { "epoch": 0.4857142857142857, "grad_norm": 0.29299834519404916, "learning_rate": 4.658957271658174e-05, "loss": 0.5155, "step": 459 }, { "epoch": 0.48677248677248675, "grad_norm": 0.3508678158463962, "learning_rate": 4.656997255978048e-05, "loss": 0.5569, "step": 460 }, { "epoch": 0.48783068783068784, "grad_norm": 0.370161022664498, "learning_rate": 4.655037240297922e-05, "loss": 0.6227, "step": 461 }, { "epoch": 0.4888888888888889, "grad_norm": 0.336334840849176, "learning_rate": 4.653077224617797e-05, "loss": 0.5896, "step": 462 }, { "epoch": 0.48994708994708996, "grad_norm": 0.3387385164468496, "learning_rate": 4.651117208937672e-05, "loss": 0.5575, "step": 463 }, { "epoch": 0.491005291005291, "grad_norm": 0.29356754801120044, "learning_rate": 4.649157193257546e-05, "loss": 0.5692, "step": 464 }, { "epoch": 0.49206349206349204, "grad_norm": 0.32417509452640764, "learning_rate": 4.647197177577421e-05, "loss": 0.542, "step": 465 }, { "epoch": 0.4931216931216931, "grad_norm": 0.26738840774433187, "learning_rate": 4.6452371618972955e-05, "loss": 0.5007, "step": 466 }, { "epoch": 0.49417989417989416, "grad_norm": 0.3639897702647601, "learning_rate": 4.64327714621717e-05, "loss": 0.5834, "step": 467 }, { "epoch": 0.49523809523809526, "grad_norm": 7.352056250865767, "learning_rate": 4.641317130537045e-05, "loss": 0.7152, "step": 468 }, { "epoch": 0.4962962962962963, "grad_norm": 0.35916204859142065, "learning_rate": 4.6393571148569196e-05, "loss": 0.508, "step": 469 }, { "epoch": 0.4973544973544973, "grad_norm": 1.1916607882832544, "learning_rate": 4.637397099176793e-05, "loss": 0.716, "step": 470 }, { "epoch": 0.4984126984126984, "grad_norm": 0.2879923677665714, "learning_rate": 4.635437083496668e-05, "loss": 0.4927, "step": 471 }, { "epoch": 0.49947089947089945, "grad_norm": 0.3327630140013385, "learning_rate": 4.633477067816543e-05, "loss": 0.567, "step": 472 }, { "epoch": 0.5005291005291005, "grad_norm": 0.47964078282416894, "learning_rate": 4.631517052136417e-05, "loss": 0.5215, "step": 473 }, { "epoch": 0.5015873015873016, "grad_norm": 0.9360332803048105, "learning_rate": 4.6295570364562915e-05, "loss": 0.5973, "step": 474 }, { "epoch": 0.5026455026455027, "grad_norm": 0.39106581876004903, "learning_rate": 4.6275970207761664e-05, "loss": 0.5984, "step": 475 }, { "epoch": 0.5037037037037037, "grad_norm": 1.6307210187055834, "learning_rate": 4.6256370050960414e-05, "loss": 0.557, "step": 476 }, { "epoch": 0.5047619047619047, "grad_norm": 0.29838204099866084, "learning_rate": 4.6236769894159156e-05, "loss": 0.584, "step": 477 }, { "epoch": 0.5058201058201058, "grad_norm": 0.3257371970902655, "learning_rate": 4.62171697373579e-05, "loss": 0.5222, "step": 478 }, { "epoch": 0.5068783068783069, "grad_norm": 0.2892518749067453, "learning_rate": 4.619756958055665e-05, "loss": 0.6022, "step": 479 }, { "epoch": 0.5079365079365079, "grad_norm": 0.3395163891295951, "learning_rate": 4.617796942375539e-05, "loss": 0.6507, "step": 480 }, { "epoch": 0.508994708994709, "grad_norm": 0.29784589758599245, "learning_rate": 4.615836926695414e-05, "loss": 0.5763, "step": 481 }, { "epoch": 0.5100529100529101, "grad_norm": 0.28702137011485396, "learning_rate": 4.613876911015288e-05, "loss": 0.5317, "step": 482 }, { "epoch": 0.5111111111111111, "grad_norm": 1.920530223820589, "learning_rate": 4.6119168953351625e-05, "loss": 0.5852, "step": 483 }, { "epoch": 0.5121693121693122, "grad_norm": 0.482240940409115, "learning_rate": 4.6099568796550374e-05, "loss": 0.5854, "step": 484 }, { "epoch": 0.5132275132275133, "grad_norm": 0.39713425911160416, "learning_rate": 4.607996863974912e-05, "loss": 0.5813, "step": 485 }, { "epoch": 0.5142857142857142, "grad_norm": 0.36739851508814947, "learning_rate": 4.6060368482947866e-05, "loss": 0.6217, "step": 486 }, { "epoch": 0.5153439153439153, "grad_norm": 0.4261831012022294, "learning_rate": 4.604076832614661e-05, "loss": 0.577, "step": 487 }, { "epoch": 0.5164021164021164, "grad_norm": 4.16824575872653, "learning_rate": 4.602116816934536e-05, "loss": 0.8018, "step": 488 }, { "epoch": 0.5174603174603175, "grad_norm": 0.4804692579466171, "learning_rate": 4.6001568012544107e-05, "loss": 0.5141, "step": 489 }, { "epoch": 0.5185185185185185, "grad_norm": 0.31014473785231145, "learning_rate": 4.598196785574285e-05, "loss": 0.5435, "step": 490 }, { "epoch": 0.5195767195767196, "grad_norm": 1.2690098499967961, "learning_rate": 4.596236769894159e-05, "loss": 0.5384, "step": 491 }, { "epoch": 0.5206349206349207, "grad_norm": 0.3480203912635704, "learning_rate": 4.594276754214034e-05, "loss": 0.5376, "step": 492 }, { "epoch": 0.5216931216931217, "grad_norm": 0.35152029139562335, "learning_rate": 4.592316738533908e-05, "loss": 0.5837, "step": 493 }, { "epoch": 0.5227513227513227, "grad_norm": 0.33161605035498015, "learning_rate": 4.590356722853783e-05, "loss": 0.5044, "step": 494 }, { "epoch": 0.5238095238095238, "grad_norm": 0.36352262970279386, "learning_rate": 4.5883967071736575e-05, "loss": 0.5796, "step": 495 }, { "epoch": 0.5248677248677248, "grad_norm": 0.3237400201076874, "learning_rate": 4.586436691493532e-05, "loss": 0.5769, "step": 496 }, { "epoch": 0.5259259259259259, "grad_norm": 2.9866473245279956, "learning_rate": 4.584476675813407e-05, "loss": 0.7518, "step": 497 }, { "epoch": 0.526984126984127, "grad_norm": 0.6522215400738879, "learning_rate": 4.5825166601332816e-05, "loss": 0.5486, "step": 498 }, { "epoch": 0.5280423280423281, "grad_norm": 0.42283674583260256, "learning_rate": 4.580556644453156e-05, "loss": 0.5667, "step": 499 }, { "epoch": 0.5291005291005291, "grad_norm": 0.584791999049459, "learning_rate": 4.57859662877303e-05, "loss": 0.5344, "step": 500 }, { "epoch": 0.5301587301587302, "grad_norm": 0.40242508358942275, "learning_rate": 4.576636613092905e-05, "loss": 0.5313, "step": 501 }, { "epoch": 0.5312169312169313, "grad_norm": 0.745312348825906, "learning_rate": 4.57467659741278e-05, "loss": 0.5376, "step": 502 }, { "epoch": 0.5322751322751322, "grad_norm": 0.49986791711058115, "learning_rate": 4.572716581732654e-05, "loss": 0.6753, "step": 503 }, { "epoch": 0.5333333333333333, "grad_norm": 0.42496269751477106, "learning_rate": 4.5707565660525285e-05, "loss": 0.5841, "step": 504 }, { "epoch": 0.5343915343915344, "grad_norm": 0.4181393322777193, "learning_rate": 4.5687965503724034e-05, "loss": 0.5555, "step": 505 }, { "epoch": 0.5354497354497354, "grad_norm": 0.3771184252890316, "learning_rate": 4.5668365346922776e-05, "loss": 0.5982, "step": 506 }, { "epoch": 0.5365079365079365, "grad_norm": 0.47820059656385455, "learning_rate": 4.5648765190121526e-05, "loss": 0.633, "step": 507 }, { "epoch": 0.5375661375661376, "grad_norm": 0.31449523301220705, "learning_rate": 4.562916503332027e-05, "loss": 0.5999, "step": 508 }, { "epoch": 0.5386243386243387, "grad_norm": 5.245441857198801, "learning_rate": 4.560956487651901e-05, "loss": 0.6226, "step": 509 }, { "epoch": 0.5396825396825397, "grad_norm": 0.64012171457468, "learning_rate": 4.558996471971776e-05, "loss": 0.6185, "step": 510 }, { "epoch": 0.5407407407407407, "grad_norm": 0.354568866388443, "learning_rate": 4.557036456291651e-05, "loss": 0.6311, "step": 511 }, { "epoch": 0.5417989417989418, "grad_norm": 1.9487919605291462, "learning_rate": 4.555076440611525e-05, "loss": 0.5987, "step": 512 }, { "epoch": 0.5428571428571428, "grad_norm": 0.6933984738129275, "learning_rate": 4.5531164249313994e-05, "loss": 0.6377, "step": 513 }, { "epoch": 0.5439153439153439, "grad_norm": 0.4755759149794088, "learning_rate": 4.551156409251274e-05, "loss": 0.5982, "step": 514 }, { "epoch": 0.544973544973545, "grad_norm": 0.650297043746455, "learning_rate": 4.549196393571149e-05, "loss": 0.5998, "step": 515 }, { "epoch": 0.546031746031746, "grad_norm": 0.5279264918555835, "learning_rate": 4.5472363778910235e-05, "loss": 0.5869, "step": 516 }, { "epoch": 0.5470899470899471, "grad_norm": 0.6614038657390391, "learning_rate": 4.545276362210898e-05, "loss": 0.5718, "step": 517 }, { "epoch": 0.5481481481481482, "grad_norm": 0.4181458689972182, "learning_rate": 4.543316346530773e-05, "loss": 0.5577, "step": 518 }, { "epoch": 0.5492063492063493, "grad_norm": 4.552268896504232, "learning_rate": 4.541356330850647e-05, "loss": 0.7083, "step": 519 }, { "epoch": 0.5502645502645502, "grad_norm": 33.061004547207546, "learning_rate": 4.539396315170522e-05, "loss": 0.8391, "step": 520 }, { "epoch": 0.5513227513227513, "grad_norm": 1.2664930712290081, "learning_rate": 4.537436299490396e-05, "loss": 0.5705, "step": 521 }, { "epoch": 0.5523809523809524, "grad_norm": 0.42354787993108955, "learning_rate": 4.5354762838102704e-05, "loss": 0.5052, "step": 522 }, { "epoch": 0.5534391534391534, "grad_norm": 0.8388212731855088, "learning_rate": 4.533516268130145e-05, "loss": 0.5349, "step": 523 }, { "epoch": 0.5544973544973545, "grad_norm": 0.7949298334624874, "learning_rate": 4.53155625245002e-05, "loss": 0.5497, "step": 524 }, { "epoch": 0.5555555555555556, "grad_norm": 0.44871889970887846, "learning_rate": 4.5295962367698945e-05, "loss": 0.5978, "step": 525 }, { "epoch": 0.5566137566137566, "grad_norm": 0.7676280541833576, "learning_rate": 4.527636221089769e-05, "loss": 0.551, "step": 526 }, { "epoch": 0.5576719576719577, "grad_norm": 0.7002659965437515, "learning_rate": 4.5256762054096436e-05, "loss": 0.5808, "step": 527 }, { "epoch": 0.5587301587301587, "grad_norm": 1.2074663454141952, "learning_rate": 4.5237161897295186e-05, "loss": 0.6605, "step": 528 }, { "epoch": 0.5597883597883598, "grad_norm": 0.712483552007316, "learning_rate": 4.521756174049392e-05, "loss": 0.618, "step": 529 }, { "epoch": 0.5608465608465608, "grad_norm": 0.331920911665764, "learning_rate": 4.519796158369267e-05, "loss": 0.5375, "step": 530 }, { "epoch": 0.5619047619047619, "grad_norm": 0.598217747836947, "learning_rate": 4.517836142689142e-05, "loss": 0.5801, "step": 531 }, { "epoch": 0.562962962962963, "grad_norm": 0.39707437203429746, "learning_rate": 4.515876127009016e-05, "loss": 0.5298, "step": 532 }, { "epoch": 0.564021164021164, "grad_norm": 0.33682693857579143, "learning_rate": 4.5139161113288905e-05, "loss": 0.5137, "step": 533 }, { "epoch": 0.5650793650793651, "grad_norm": 0.48164899152667084, "learning_rate": 4.5119560956487654e-05, "loss": 0.5781, "step": 534 }, { "epoch": 0.5661375661375662, "grad_norm": 0.3219468021100436, "learning_rate": 4.5099960799686397e-05, "loss": 0.4857, "step": 535 }, { "epoch": 0.5671957671957671, "grad_norm": 0.3320548497953154, "learning_rate": 4.5080360642885146e-05, "loss": 0.5725, "step": 536 }, { "epoch": 0.5682539682539682, "grad_norm": 0.4483913400362615, "learning_rate": 4.5060760486083895e-05, "loss": 0.4854, "step": 537 }, { "epoch": 0.5693121693121693, "grad_norm": 0.36430948857681866, "learning_rate": 4.504116032928264e-05, "loss": 0.662, "step": 538 }, { "epoch": 0.5703703703703704, "grad_norm": 0.342825639833936, "learning_rate": 4.502156017248138e-05, "loss": 0.5921, "step": 539 }, { "epoch": 0.5714285714285714, "grad_norm": 0.3734526413757713, "learning_rate": 4.500196001568013e-05, "loss": 0.5297, "step": 540 }, { "epoch": 0.5724867724867725, "grad_norm": 0.3388581653822819, "learning_rate": 4.498235985887888e-05, "loss": 0.6147, "step": 541 }, { "epoch": 0.5735449735449736, "grad_norm": 0.3205452586395781, "learning_rate": 4.4962759702077614e-05, "loss": 0.5175, "step": 542 }, { "epoch": 0.5746031746031746, "grad_norm": 0.34747287406127814, "learning_rate": 4.4943159545276363e-05, "loss": 0.588, "step": 543 }, { "epoch": 0.5756613756613757, "grad_norm": 0.3293870457786716, "learning_rate": 4.492355938847511e-05, "loss": 0.6207, "step": 544 }, { "epoch": 0.5767195767195767, "grad_norm": 1.5911037019280505, "learning_rate": 4.4903959231673855e-05, "loss": 0.6216, "step": 545 }, { "epoch": 0.5777777777777777, "grad_norm": 0.2970675826508983, "learning_rate": 4.48843590748726e-05, "loss": 0.543, "step": 546 }, { "epoch": 0.5788359788359788, "grad_norm": 0.3347900655603031, "learning_rate": 4.486475891807135e-05, "loss": 0.5524, "step": 547 }, { "epoch": 0.5798941798941799, "grad_norm": 0.3297134029351915, "learning_rate": 4.484515876127009e-05, "loss": 0.5289, "step": 548 }, { "epoch": 0.580952380952381, "grad_norm": 0.33203980488440166, "learning_rate": 4.482555860446884e-05, "loss": 0.5628, "step": 549 }, { "epoch": 0.582010582010582, "grad_norm": 0.3675901822117606, "learning_rate": 4.480595844766758e-05, "loss": 0.6431, "step": 550 }, { "epoch": 0.5830687830687831, "grad_norm": 8.35298256158783, "learning_rate": 4.478635829086633e-05, "loss": 0.6803, "step": 551 }, { "epoch": 0.5841269841269842, "grad_norm": 0.38944467670316385, "learning_rate": 4.476675813406507e-05, "loss": 0.6053, "step": 552 }, { "epoch": 0.5851851851851851, "grad_norm": 1.8390547635610588, "learning_rate": 4.474715797726382e-05, "loss": 0.5739, "step": 553 }, { "epoch": 0.5862433862433862, "grad_norm": 0.3840415785962572, "learning_rate": 4.4727557820462565e-05, "loss": 0.6275, "step": 554 }, { "epoch": 0.5873015873015873, "grad_norm": 0.29418089397003, "learning_rate": 4.470795766366131e-05, "loss": 0.535, "step": 555 }, { "epoch": 0.5883597883597883, "grad_norm": 0.3736504054747309, "learning_rate": 4.4688357506860056e-05, "loss": 0.5943, "step": 556 }, { "epoch": 0.5894179894179894, "grad_norm": 0.33285236147227376, "learning_rate": 4.4668757350058806e-05, "loss": 0.5586, "step": 557 }, { "epoch": 0.5904761904761905, "grad_norm": 0.2903484444203814, "learning_rate": 4.464915719325755e-05, "loss": 0.5365, "step": 558 }, { "epoch": 0.5915343915343916, "grad_norm": 5.46404606246135, "learning_rate": 4.462955703645629e-05, "loss": 0.6439, "step": 559 }, { "epoch": 0.5925925925925926, "grad_norm": 0.4599968468845947, "learning_rate": 4.460995687965504e-05, "loss": 0.6976, "step": 560 }, { "epoch": 0.5936507936507937, "grad_norm": 0.2982395341967786, "learning_rate": 4.459035672285379e-05, "loss": 0.5041, "step": 561 }, { "epoch": 0.5947089947089947, "grad_norm": 0.30282795685306324, "learning_rate": 4.457075656605253e-05, "loss": 0.5909, "step": 562 }, { "epoch": 0.5957671957671957, "grad_norm": 0.39972890415638074, "learning_rate": 4.4551156409251274e-05, "loss": 0.6235, "step": 563 }, { "epoch": 0.5968253968253968, "grad_norm": 0.2873678600174425, "learning_rate": 4.4531556252450023e-05, "loss": 0.5421, "step": 564 }, { "epoch": 0.5978835978835979, "grad_norm": 0.3559983769537934, "learning_rate": 4.4511956095648766e-05, "loss": 0.5561, "step": 565 }, { "epoch": 0.5989417989417989, "grad_norm": 0.3147975278256888, "learning_rate": 4.4492355938847515e-05, "loss": 0.4711, "step": 566 }, { "epoch": 0.6, "grad_norm": 0.2607641173456458, "learning_rate": 4.447275578204626e-05, "loss": 0.5823, "step": 567 }, { "epoch": 0.6010582010582011, "grad_norm": 0.2926523078468275, "learning_rate": 4.4453155625245e-05, "loss": 0.5406, "step": 568 }, { "epoch": 0.6021164021164022, "grad_norm": 0.3452463175268165, "learning_rate": 4.443355546844375e-05, "loss": 0.5894, "step": 569 }, { "epoch": 0.6031746031746031, "grad_norm": 0.24630431433470665, "learning_rate": 4.44139553116425e-05, "loss": 0.517, "step": 570 }, { "epoch": 0.6042328042328042, "grad_norm": 0.3401893647581997, "learning_rate": 4.439435515484124e-05, "loss": 0.5647, "step": 571 }, { "epoch": 0.6052910052910053, "grad_norm": 0.3208322781574765, "learning_rate": 4.4374754998039984e-05, "loss": 0.5116, "step": 572 }, { "epoch": 0.6063492063492063, "grad_norm": 0.28383141803835954, "learning_rate": 4.435515484123873e-05, "loss": 0.5441, "step": 573 }, { "epoch": 0.6074074074074074, "grad_norm": 0.34262017652008814, "learning_rate": 4.433555468443748e-05, "loss": 0.5972, "step": 574 }, { "epoch": 0.6084656084656085, "grad_norm": 0.31557046384544746, "learning_rate": 4.4315954527636225e-05, "loss": 0.6478, "step": 575 }, { "epoch": 0.6095238095238096, "grad_norm": 0.31340622501168497, "learning_rate": 4.429635437083497e-05, "loss": 0.561, "step": 576 }, { "epoch": 0.6105820105820106, "grad_norm": 0.27478250517189123, "learning_rate": 4.4276754214033716e-05, "loss": 0.6372, "step": 577 }, { "epoch": 0.6116402116402117, "grad_norm": 0.3096835013791637, "learning_rate": 4.425715405723246e-05, "loss": 0.5873, "step": 578 }, { "epoch": 0.6126984126984127, "grad_norm": 0.29052274812803913, "learning_rate": 4.423755390043121e-05, "loss": 0.6673, "step": 579 }, { "epoch": 0.6137566137566137, "grad_norm": 0.2726481497564029, "learning_rate": 4.421795374362995e-05, "loss": 0.5688, "step": 580 }, { "epoch": 0.6148148148148148, "grad_norm": 0.30868513597771813, "learning_rate": 4.419835358682869e-05, "loss": 0.5413, "step": 581 }, { "epoch": 0.6158730158730159, "grad_norm": 0.2931107306494861, "learning_rate": 4.417875343002744e-05, "loss": 0.6191, "step": 582 }, { "epoch": 0.6169312169312169, "grad_norm": 0.27359852790549766, "learning_rate": 4.415915327322619e-05, "loss": 0.5976, "step": 583 }, { "epoch": 0.617989417989418, "grad_norm": 0.300451446217763, "learning_rate": 4.4139553116424934e-05, "loss": 0.5958, "step": 584 }, { "epoch": 0.6190476190476191, "grad_norm": 0.2739263908226725, "learning_rate": 4.411995295962368e-05, "loss": 0.4819, "step": 585 }, { "epoch": 0.6201058201058202, "grad_norm": 0.3629370235736209, "learning_rate": 4.4100352802822426e-05, "loss": 0.5969, "step": 586 }, { "epoch": 0.6211640211640211, "grad_norm": 0.2546483909460872, "learning_rate": 4.4080752646021175e-05, "loss": 0.5637, "step": 587 }, { "epoch": 0.6222222222222222, "grad_norm": 0.2900274901311552, "learning_rate": 4.406115248921992e-05, "loss": 0.5981, "step": 588 }, { "epoch": 0.6232804232804233, "grad_norm": 0.30997705577212653, "learning_rate": 4.404155233241866e-05, "loss": 0.5763, "step": 589 }, { "epoch": 0.6243386243386243, "grad_norm": 0.27235932084700465, "learning_rate": 4.402195217561741e-05, "loss": 0.6027, "step": 590 }, { "epoch": 0.6253968253968254, "grad_norm": 0.27424029005807476, "learning_rate": 4.400235201881615e-05, "loss": 0.528, "step": 591 }, { "epoch": 0.6264550264550265, "grad_norm": 0.8024320046005814, "learning_rate": 4.39827518620149e-05, "loss": 0.5073, "step": 592 }, { "epoch": 0.6275132275132275, "grad_norm": 0.25330323543588595, "learning_rate": 4.3963151705213644e-05, "loss": 0.5903, "step": 593 }, { "epoch": 0.6285714285714286, "grad_norm": 0.2786349566048451, "learning_rate": 4.3943551548412386e-05, "loss": 0.5646, "step": 594 }, { "epoch": 0.6296296296296297, "grad_norm": 0.27111077124213456, "learning_rate": 4.3923951391611135e-05, "loss": 0.5166, "step": 595 }, { "epoch": 0.6306878306878307, "grad_norm": 0.2537581235144252, "learning_rate": 4.3904351234809885e-05, "loss": 0.5475, "step": 596 }, { "epoch": 0.6317460317460317, "grad_norm": 0.2940667859193826, "learning_rate": 4.388475107800862e-05, "loss": 0.5437, "step": 597 }, { "epoch": 0.6328042328042328, "grad_norm": 0.26691327126042796, "learning_rate": 4.386515092120737e-05, "loss": 0.5508, "step": 598 }, { "epoch": 0.6338624338624339, "grad_norm": 3.125987633190478, "learning_rate": 4.384555076440612e-05, "loss": 0.5517, "step": 599 }, { "epoch": 0.6349206349206349, "grad_norm": 0.3051459051560689, "learning_rate": 4.382595060760487e-05, "loss": 0.5032, "step": 600 }, { "epoch": 0.635978835978836, "grad_norm": 0.2820838021225074, "learning_rate": 4.3806350450803604e-05, "loss": 0.5407, "step": 601 }, { "epoch": 0.6370370370370371, "grad_norm": 1.3393856684804768, "learning_rate": 4.378675029400235e-05, "loss": 0.6108, "step": 602 }, { "epoch": 0.638095238095238, "grad_norm": 0.3228594095981327, "learning_rate": 4.37671501372011e-05, "loss": 0.5025, "step": 603 }, { "epoch": 0.6391534391534391, "grad_norm": 0.3351209221453693, "learning_rate": 4.3747549980399845e-05, "loss": 0.5659, "step": 604 }, { "epoch": 0.6402116402116402, "grad_norm": 0.31230131092186864, "learning_rate": 4.372794982359859e-05, "loss": 0.6007, "step": 605 }, { "epoch": 0.6412698412698413, "grad_norm": 0.2828867288582573, "learning_rate": 4.370834966679734e-05, "loss": 0.527, "step": 606 }, { "epoch": 0.6423280423280423, "grad_norm": 0.34529541782733586, "learning_rate": 4.368874950999608e-05, "loss": 0.5486, "step": 607 }, { "epoch": 0.6433862433862434, "grad_norm": 0.2951068794541986, "learning_rate": 4.366914935319483e-05, "loss": 0.5267, "step": 608 }, { "epoch": 0.6444444444444445, "grad_norm": 0.40815008679176695, "learning_rate": 4.364954919639358e-05, "loss": 0.5177, "step": 609 }, { "epoch": 0.6455026455026455, "grad_norm": 0.32045696467121443, "learning_rate": 4.362994903959231e-05, "loss": 0.5586, "step": 610 }, { "epoch": 0.6465608465608466, "grad_norm": 0.32938732342590915, "learning_rate": 4.361034888279106e-05, "loss": 0.5711, "step": 611 }, { "epoch": 0.6476190476190476, "grad_norm": 0.2914985462165065, "learning_rate": 4.359074872598981e-05, "loss": 0.6356, "step": 612 }, { "epoch": 0.6486772486772486, "grad_norm": 0.31957339334532875, "learning_rate": 4.357114856918856e-05, "loss": 0.5775, "step": 613 }, { "epoch": 0.6497354497354497, "grad_norm": 0.3380048318234734, "learning_rate": 4.35515484123873e-05, "loss": 0.5668, "step": 614 }, { "epoch": 0.6507936507936508, "grad_norm": 0.2677052222743914, "learning_rate": 4.3531948255586046e-05, "loss": 0.5867, "step": 615 }, { "epoch": 0.6518518518518519, "grad_norm": 0.2714050773104208, "learning_rate": 4.3512348098784795e-05, "loss": 0.569, "step": 616 }, { "epoch": 0.6529100529100529, "grad_norm": 0.322108580723527, "learning_rate": 4.349274794198354e-05, "loss": 0.5379, "step": 617 }, { "epoch": 0.653968253968254, "grad_norm": 0.2569245838676028, "learning_rate": 4.347314778518228e-05, "loss": 0.5118, "step": 618 }, { "epoch": 0.6550264550264551, "grad_norm": 0.2530435498061503, "learning_rate": 4.345354762838103e-05, "loss": 0.5139, "step": 619 }, { "epoch": 0.656084656084656, "grad_norm": 0.29585162017581107, "learning_rate": 4.343394747157977e-05, "loss": 0.4954, "step": 620 }, { "epoch": 0.6571428571428571, "grad_norm": 0.2924074793370025, "learning_rate": 4.341434731477852e-05, "loss": 0.6042, "step": 621 }, { "epoch": 0.6582010582010582, "grad_norm": 0.2855302831026707, "learning_rate": 4.3394747157977264e-05, "loss": 0.514, "step": 622 }, { "epoch": 0.6592592592592592, "grad_norm": 0.7366169291367959, "learning_rate": 4.337514700117601e-05, "loss": 0.4895, "step": 623 }, { "epoch": 0.6603174603174603, "grad_norm": 0.27063474014038635, "learning_rate": 4.3355546844374756e-05, "loss": 0.5592, "step": 624 }, { "epoch": 0.6613756613756614, "grad_norm": 0.2539651508587892, "learning_rate": 4.3335946687573505e-05, "loss": 0.5301, "step": 625 }, { "epoch": 0.6624338624338625, "grad_norm": 0.2458381741039525, "learning_rate": 4.331634653077225e-05, "loss": 0.5762, "step": 626 }, { "epoch": 0.6634920634920635, "grad_norm": 0.25275919533490976, "learning_rate": 4.329674637397099e-05, "loss": 0.5422, "step": 627 }, { "epoch": 0.6645502645502646, "grad_norm": 0.2626626352002252, "learning_rate": 4.327714621716974e-05, "loss": 0.5175, "step": 628 }, { "epoch": 0.6656084656084656, "grad_norm": 0.2646967802621259, "learning_rate": 4.325754606036849e-05, "loss": 0.4619, "step": 629 }, { "epoch": 0.6666666666666666, "grad_norm": 0.24368769535748608, "learning_rate": 4.323794590356723e-05, "loss": 0.5964, "step": 630 }, { "epoch": 0.6677248677248677, "grad_norm": 0.2959718652590851, "learning_rate": 4.321834574676597e-05, "loss": 0.5691, "step": 631 }, { "epoch": 0.6687830687830688, "grad_norm": 0.2871293046686386, "learning_rate": 4.319874558996472e-05, "loss": 0.5696, "step": 632 }, { "epoch": 0.6698412698412698, "grad_norm": 0.2905598264986975, "learning_rate": 4.3179145433163465e-05, "loss": 0.629, "step": 633 }, { "epoch": 0.6708994708994709, "grad_norm": 0.2528097775183016, "learning_rate": 4.3159545276362214e-05, "loss": 0.4849, "step": 634 }, { "epoch": 0.671957671957672, "grad_norm": 0.33621938773228494, "learning_rate": 4.313994511956096e-05, "loss": 0.5879, "step": 635 }, { "epoch": 0.6730158730158731, "grad_norm": 0.2394966771033253, "learning_rate": 4.3120344962759706e-05, "loss": 0.439, "step": 636 }, { "epoch": 0.674074074074074, "grad_norm": 0.3150605848513219, "learning_rate": 4.310074480595845e-05, "loss": 0.5989, "step": 637 }, { "epoch": 0.6751322751322751, "grad_norm": 0.2981707132882546, "learning_rate": 4.30811446491572e-05, "loss": 0.6225, "step": 638 }, { "epoch": 0.6761904761904762, "grad_norm": 0.30492112978736213, "learning_rate": 4.306154449235594e-05, "loss": 0.5404, "step": 639 }, { "epoch": 0.6772486772486772, "grad_norm": 0.2814127526303401, "learning_rate": 4.304194433555468e-05, "loss": 0.5233, "step": 640 }, { "epoch": 0.6783068783068783, "grad_norm": 0.30173716166671577, "learning_rate": 4.302234417875343e-05, "loss": 0.6427, "step": 641 }, { "epoch": 0.6793650793650794, "grad_norm": 0.25137410162980395, "learning_rate": 4.300274402195218e-05, "loss": 0.5607, "step": 642 }, { "epoch": 0.6804232804232804, "grad_norm": 0.30267125046888244, "learning_rate": 4.2983143865150924e-05, "loss": 0.6308, "step": 643 }, { "epoch": 0.6814814814814815, "grad_norm": 0.2570686438542522, "learning_rate": 4.2963543708349666e-05, "loss": 0.5708, "step": 644 }, { "epoch": 0.6825396825396826, "grad_norm": 0.47196520519510343, "learning_rate": 4.2943943551548416e-05, "loss": 0.4772, "step": 645 }, { "epoch": 0.6835978835978836, "grad_norm": 0.2733048973472466, "learning_rate": 4.292434339474716e-05, "loss": 0.5569, "step": 646 }, { "epoch": 0.6846560846560846, "grad_norm": 0.29886485226274184, "learning_rate": 4.290474323794591e-05, "loss": 0.5785, "step": 647 }, { "epoch": 0.6857142857142857, "grad_norm": 0.2403402937990743, "learning_rate": 4.288514308114465e-05, "loss": 0.5595, "step": 648 }, { "epoch": 0.6867724867724868, "grad_norm": 0.2794008545700776, "learning_rate": 4.28655429243434e-05, "loss": 0.5396, "step": 649 }, { "epoch": 0.6878306878306878, "grad_norm": 0.3041316437038585, "learning_rate": 4.284594276754214e-05, "loss": 0.5601, "step": 650 }, { "epoch": 0.6888888888888889, "grad_norm": 0.24855481366370547, "learning_rate": 4.282634261074089e-05, "loss": 0.5405, "step": 651 }, { "epoch": 0.68994708994709, "grad_norm": 0.3040097458165702, "learning_rate": 4.280674245393963e-05, "loss": 0.64, "step": 652 }, { "epoch": 0.691005291005291, "grad_norm": 0.3069869635860471, "learning_rate": 4.2787142297138376e-05, "loss": 0.4583, "step": 653 }, { "epoch": 0.692063492063492, "grad_norm": 0.2936008162342032, "learning_rate": 4.2767542140337125e-05, "loss": 0.5027, "step": 654 }, { "epoch": 0.6931216931216931, "grad_norm": 0.25492661711077735, "learning_rate": 4.2747941983535874e-05, "loss": 0.5225, "step": 655 }, { "epoch": 0.6941798941798942, "grad_norm": 2.5144220055291275, "learning_rate": 4.272834182673462e-05, "loss": 0.6846, "step": 656 }, { "epoch": 0.6952380952380952, "grad_norm": 0.28990557153471347, "learning_rate": 4.270874166993336e-05, "loss": 0.5752, "step": 657 }, { "epoch": 0.6962962962962963, "grad_norm": 0.26542534568407355, "learning_rate": 4.268914151313211e-05, "loss": 0.4701, "step": 658 }, { "epoch": 0.6973544973544974, "grad_norm": 0.3193855535773065, "learning_rate": 4.266954135633085e-05, "loss": 0.4586, "step": 659 }, { "epoch": 0.6984126984126984, "grad_norm": 0.27477559170920174, "learning_rate": 4.26499411995296e-05, "loss": 0.5044, "step": 660 }, { "epoch": 0.6994708994708995, "grad_norm": 0.2691283100685374, "learning_rate": 4.263034104272834e-05, "loss": 0.6087, "step": 661 }, { "epoch": 0.7005291005291006, "grad_norm": 0.3830270820991914, "learning_rate": 4.261074088592709e-05, "loss": 0.5775, "step": 662 }, { "epoch": 0.7015873015873015, "grad_norm": 0.2752894226111952, "learning_rate": 4.2591140729125835e-05, "loss": 0.5079, "step": 663 }, { "epoch": 0.7026455026455026, "grad_norm": 0.28397687271151834, "learning_rate": 4.2571540572324584e-05, "loss": 0.5804, "step": 664 }, { "epoch": 0.7037037037037037, "grad_norm": 3.367695352363058, "learning_rate": 4.2551940415523326e-05, "loss": 0.7196, "step": 665 }, { "epoch": 0.7047619047619048, "grad_norm": 0.38941063877467563, "learning_rate": 4.253234025872207e-05, "loss": 0.5676, "step": 666 }, { "epoch": 0.7058201058201058, "grad_norm": 0.299525285575676, "learning_rate": 4.251274010192082e-05, "loss": 0.5809, "step": 667 }, { "epoch": 0.7068783068783069, "grad_norm": 0.24410082905969419, "learning_rate": 4.249313994511957e-05, "loss": 0.5784, "step": 668 }, { "epoch": 0.707936507936508, "grad_norm": 0.817819426083752, "learning_rate": 4.24735397883183e-05, "loss": 0.5015, "step": 669 }, { "epoch": 0.708994708994709, "grad_norm": 0.35760232367472883, "learning_rate": 4.245393963151705e-05, "loss": 0.6156, "step": 670 }, { "epoch": 0.71005291005291, "grad_norm": 0.2827406205547459, "learning_rate": 4.24343394747158e-05, "loss": 0.5757, "step": 671 }, { "epoch": 0.7111111111111111, "grad_norm": 0.28008993326048687, "learning_rate": 4.241473931791455e-05, "loss": 0.5112, "step": 672 }, { "epoch": 0.7121693121693121, "grad_norm": 0.2924001411666532, "learning_rate": 4.2395139161113287e-05, "loss": 0.5523, "step": 673 }, { "epoch": 0.7132275132275132, "grad_norm": 0.27305443841057714, "learning_rate": 4.2375539004312036e-05, "loss": 0.5126, "step": 674 }, { "epoch": 0.7142857142857143, "grad_norm": 0.2736723087303262, "learning_rate": 4.2355938847510785e-05, "loss": 0.5388, "step": 675 }, { "epoch": 0.7153439153439154, "grad_norm": 0.27719351183152063, "learning_rate": 4.233633869070953e-05, "loss": 0.5331, "step": 676 }, { "epoch": 0.7164021164021164, "grad_norm": 0.29024073675556417, "learning_rate": 4.231673853390827e-05, "loss": 0.5324, "step": 677 }, { "epoch": 0.7174603174603175, "grad_norm": 0.28459105874291735, "learning_rate": 4.229713837710702e-05, "loss": 0.5723, "step": 678 }, { "epoch": 0.7185185185185186, "grad_norm": 0.27008204854468826, "learning_rate": 4.227753822030576e-05, "loss": 0.5183, "step": 679 }, { "epoch": 0.7195767195767195, "grad_norm": 0.32531200458934517, "learning_rate": 4.225793806350451e-05, "loss": 0.5417, "step": 680 }, { "epoch": 0.7206349206349206, "grad_norm": 0.2827590448071415, "learning_rate": 4.223833790670326e-05, "loss": 0.5688, "step": 681 }, { "epoch": 0.7216931216931217, "grad_norm": 0.32843041348454105, "learning_rate": 4.2218737749901996e-05, "loss": 0.4738, "step": 682 }, { "epoch": 0.7227513227513227, "grad_norm": 0.29203603535869277, "learning_rate": 4.2199137593100745e-05, "loss": 0.571, "step": 683 }, { "epoch": 0.7238095238095238, "grad_norm": 0.26235624444883343, "learning_rate": 4.2179537436299495e-05, "loss": 0.5339, "step": 684 }, { "epoch": 0.7248677248677249, "grad_norm": 0.2743770991741004, "learning_rate": 4.2159937279498244e-05, "loss": 0.5938, "step": 685 }, { "epoch": 0.725925925925926, "grad_norm": 0.28706543540832447, "learning_rate": 4.214033712269698e-05, "loss": 0.5088, "step": 686 }, { "epoch": 0.726984126984127, "grad_norm": 0.2831332896571073, "learning_rate": 4.212073696589573e-05, "loss": 0.606, "step": 687 }, { "epoch": 0.728042328042328, "grad_norm": 0.262186187511753, "learning_rate": 4.210113680909448e-05, "loss": 0.5341, "step": 688 }, { "epoch": 0.7291005291005291, "grad_norm": 0.3096610015904226, "learning_rate": 4.208153665229322e-05, "loss": 0.5336, "step": 689 }, { "epoch": 0.7301587301587301, "grad_norm": 0.2733332314767257, "learning_rate": 4.206193649549196e-05, "loss": 0.5145, "step": 690 }, { "epoch": 0.7312169312169312, "grad_norm": 0.2438987308317826, "learning_rate": 4.204233633869071e-05, "loss": 0.5574, "step": 691 }, { "epoch": 0.7322751322751323, "grad_norm": 0.29975311338331406, "learning_rate": 4.2022736181889455e-05, "loss": 0.5676, "step": 692 }, { "epoch": 0.7333333333333333, "grad_norm": 0.3159755996646559, "learning_rate": 4.2003136025088204e-05, "loss": 0.6154, "step": 693 }, { "epoch": 0.7343915343915344, "grad_norm": 0.24416951903546183, "learning_rate": 4.1983535868286947e-05, "loss": 0.5248, "step": 694 }, { "epoch": 0.7354497354497355, "grad_norm": 0.24947362261992206, "learning_rate": 4.196393571148569e-05, "loss": 0.5121, "step": 695 }, { "epoch": 0.7365079365079366, "grad_norm": 0.30381658421133123, "learning_rate": 4.194433555468444e-05, "loss": 0.583, "step": 696 }, { "epoch": 0.7375661375661375, "grad_norm": 0.23096151371713622, "learning_rate": 4.192473539788319e-05, "loss": 0.5245, "step": 697 }, { "epoch": 0.7386243386243386, "grad_norm": 0.3162192673911441, "learning_rate": 4.190513524108193e-05, "loss": 0.5468, "step": 698 }, { "epoch": 0.7396825396825397, "grad_norm": 0.25182522028970394, "learning_rate": 4.188553508428067e-05, "loss": 0.597, "step": 699 }, { "epoch": 0.7407407407407407, "grad_norm": 0.2684334525549805, "learning_rate": 4.186593492747942e-05, "loss": 0.576, "step": 700 }, { "epoch": 0.7417989417989418, "grad_norm": 1.523561308125903, "learning_rate": 4.184633477067817e-05, "loss": 0.5329, "step": 701 }, { "epoch": 0.7428571428571429, "grad_norm": 0.28565441595265484, "learning_rate": 4.1826734613876914e-05, "loss": 0.5196, "step": 702 }, { "epoch": 0.7439153439153439, "grad_norm": 0.2888862478367132, "learning_rate": 4.1807134457075656e-05, "loss": 0.6255, "step": 703 }, { "epoch": 0.744973544973545, "grad_norm": 0.3007510062268032, "learning_rate": 4.1787534300274405e-05, "loss": 0.5565, "step": 704 }, { "epoch": 0.746031746031746, "grad_norm": 0.27427995824764073, "learning_rate": 4.176793414347315e-05, "loss": 0.5533, "step": 705 }, { "epoch": 0.7470899470899471, "grad_norm": 0.24178542578986884, "learning_rate": 4.17483339866719e-05, "loss": 0.4839, "step": 706 }, { "epoch": 0.7481481481481481, "grad_norm": 0.3447762627838107, "learning_rate": 4.172873382987064e-05, "loss": 0.5418, "step": 707 }, { "epoch": 0.7492063492063492, "grad_norm": 0.28512746124111127, "learning_rate": 4.170913367306938e-05, "loss": 0.5593, "step": 708 }, { "epoch": 0.7502645502645503, "grad_norm": 0.25654137696146834, "learning_rate": 4.168953351626813e-05, "loss": 0.5308, "step": 709 }, { "epoch": 0.7513227513227513, "grad_norm": 0.3162447256848906, "learning_rate": 4.166993335946688e-05, "loss": 0.5092, "step": 710 }, { "epoch": 0.7523809523809524, "grad_norm": 0.2706709395661285, "learning_rate": 4.165033320266562e-05, "loss": 0.549, "step": 711 }, { "epoch": 0.7534391534391535, "grad_norm": 0.27951783492610943, "learning_rate": 4.1630733045864365e-05, "loss": 0.5039, "step": 712 }, { "epoch": 0.7544973544973544, "grad_norm": 0.34542932566254875, "learning_rate": 4.1611132889063115e-05, "loss": 0.5269, "step": 713 }, { "epoch": 0.7555555555555555, "grad_norm": 0.2666243377680122, "learning_rate": 4.1591532732261864e-05, "loss": 0.5833, "step": 714 }, { "epoch": 0.7566137566137566, "grad_norm": 0.3245207820226863, "learning_rate": 4.1571932575460606e-05, "loss": 0.5764, "step": 715 }, { "epoch": 0.7576719576719577, "grad_norm": 0.26147609233769936, "learning_rate": 4.155233241865935e-05, "loss": 0.5527, "step": 716 }, { "epoch": 0.7587301587301587, "grad_norm": 0.2584431704263216, "learning_rate": 4.15327322618581e-05, "loss": 0.5637, "step": 717 }, { "epoch": 0.7597883597883598, "grad_norm": 0.2731600454527648, "learning_rate": 4.151313210505684e-05, "loss": 0.6257, "step": 718 }, { "epoch": 0.7608465608465609, "grad_norm": 0.2430610169832162, "learning_rate": 4.149353194825559e-05, "loss": 0.5118, "step": 719 }, { "epoch": 0.7619047619047619, "grad_norm": 0.46705609114743374, "learning_rate": 4.147393179145433e-05, "loss": 0.5657, "step": 720 }, { "epoch": 0.762962962962963, "grad_norm": 0.2765619748725685, "learning_rate": 4.145433163465308e-05, "loss": 0.5653, "step": 721 }, { "epoch": 0.764021164021164, "grad_norm": 0.2445879143109297, "learning_rate": 4.1434731477851824e-05, "loss": 0.5549, "step": 722 }, { "epoch": 0.765079365079365, "grad_norm": 0.29444717176229945, "learning_rate": 4.1415131321050573e-05, "loss": 0.5159, "step": 723 }, { "epoch": 0.7661375661375661, "grad_norm": 0.23188418245940617, "learning_rate": 4.1395531164249316e-05, "loss": 0.5596, "step": 724 }, { "epoch": 0.7671957671957672, "grad_norm": 0.2726530916672444, "learning_rate": 4.137593100744806e-05, "loss": 0.5596, "step": 725 }, { "epoch": 0.7682539682539683, "grad_norm": 0.26947956374326953, "learning_rate": 4.135633085064681e-05, "loss": 0.5813, "step": 726 }, { "epoch": 0.7693121693121693, "grad_norm": 0.23506890162797972, "learning_rate": 4.133673069384556e-05, "loss": 0.5437, "step": 727 }, { "epoch": 0.7703703703703704, "grad_norm": 2.972237291768401, "learning_rate": 4.13171305370443e-05, "loss": 0.5998, "step": 728 }, { "epoch": 0.7714285714285715, "grad_norm": 0.27441250036308384, "learning_rate": 4.129753038024304e-05, "loss": 0.5271, "step": 729 }, { "epoch": 0.7724867724867724, "grad_norm": 0.2522029487623296, "learning_rate": 4.127793022344179e-05, "loss": 0.5339, "step": 730 }, { "epoch": 0.7735449735449735, "grad_norm": 0.25217726436063065, "learning_rate": 4.1258330066640534e-05, "loss": 0.578, "step": 731 }, { "epoch": 0.7746031746031746, "grad_norm": 0.25344645901788887, "learning_rate": 4.123872990983928e-05, "loss": 0.5418, "step": 732 }, { "epoch": 0.7756613756613756, "grad_norm": 0.2396746178638489, "learning_rate": 4.1219129753038025e-05, "loss": 0.5244, "step": 733 }, { "epoch": 0.7767195767195767, "grad_norm": 0.31881877046038376, "learning_rate": 4.1199529596236775e-05, "loss": 0.5785, "step": 734 }, { "epoch": 0.7777777777777778, "grad_norm": 0.23060282015738198, "learning_rate": 4.117992943943552e-05, "loss": 0.4795, "step": 735 }, { "epoch": 0.7788359788359789, "grad_norm": 0.26057461199701626, "learning_rate": 4.1160329282634266e-05, "loss": 0.4967, "step": 736 }, { "epoch": 0.7798941798941799, "grad_norm": 0.24273599035257443, "learning_rate": 4.114072912583301e-05, "loss": 0.5449, "step": 737 }, { "epoch": 0.780952380952381, "grad_norm": 0.23318919061727453, "learning_rate": 4.112112896903175e-05, "loss": 0.4872, "step": 738 }, { "epoch": 0.782010582010582, "grad_norm": 0.28999779048722424, "learning_rate": 4.11015288122305e-05, "loss": 0.6129, "step": 739 }, { "epoch": 0.783068783068783, "grad_norm": 0.25705614998142, "learning_rate": 4.108192865542925e-05, "loss": 0.5755, "step": 740 }, { "epoch": 0.7841269841269841, "grad_norm": 0.24516780262995658, "learning_rate": 4.1062328498627986e-05, "loss": 0.5313, "step": 741 }, { "epoch": 0.7851851851851852, "grad_norm": 0.2632105354730925, "learning_rate": 4.1042728341826735e-05, "loss": 0.4978, "step": 742 }, { "epoch": 0.7862433862433862, "grad_norm": 0.2950535479592732, "learning_rate": 4.1023128185025484e-05, "loss": 0.5477, "step": 743 }, { "epoch": 0.7873015873015873, "grad_norm": 0.2890918764180236, "learning_rate": 4.100352802822423e-05, "loss": 0.6027, "step": 744 }, { "epoch": 0.7883597883597884, "grad_norm": 0.24905516173631076, "learning_rate": 4.098392787142297e-05, "loss": 0.4916, "step": 745 }, { "epoch": 0.7894179894179895, "grad_norm": 0.30918231275089586, "learning_rate": 4.096432771462172e-05, "loss": 0.5521, "step": 746 }, { "epoch": 0.7904761904761904, "grad_norm": 0.254678981001304, "learning_rate": 4.094472755782047e-05, "loss": 0.5693, "step": 747 }, { "epoch": 0.7915343915343915, "grad_norm": 0.24323586920042417, "learning_rate": 4.092512740101921e-05, "loss": 0.5497, "step": 748 }, { "epoch": 0.7925925925925926, "grad_norm": 0.2888285005504316, "learning_rate": 4.090552724421795e-05, "loss": 0.4983, "step": 749 }, { "epoch": 0.7936507936507936, "grad_norm": 0.259386381669663, "learning_rate": 4.08859270874167e-05, "loss": 0.6195, "step": 750 }, { "epoch": 0.7947089947089947, "grad_norm": 0.23036461293567234, "learning_rate": 4.0866326930615444e-05, "loss": 0.5492, "step": 751 }, { "epoch": 0.7957671957671958, "grad_norm": 0.2678855648948368, "learning_rate": 4.0846726773814194e-05, "loss": 0.5072, "step": 752 }, { "epoch": 0.7968253968253968, "grad_norm": 0.2887180222360751, "learning_rate": 4.082712661701294e-05, "loss": 0.5246, "step": 753 }, { "epoch": 0.7978835978835979, "grad_norm": 0.2568292309960633, "learning_rate": 4.080752646021168e-05, "loss": 0.5184, "step": 754 }, { "epoch": 0.798941798941799, "grad_norm": 0.24711251400823, "learning_rate": 4.078792630341043e-05, "loss": 0.5341, "step": 755 }, { "epoch": 0.8, "grad_norm": 0.31036770266770836, "learning_rate": 4.076832614660918e-05, "loss": 0.6273, "step": 756 }, { "epoch": 0.801058201058201, "grad_norm": 0.2571757220271915, "learning_rate": 4.074872598980792e-05, "loss": 0.5696, "step": 757 }, { "epoch": 0.8021164021164021, "grad_norm": 0.26980125079686357, "learning_rate": 4.072912583300666e-05, "loss": 0.5467, "step": 758 }, { "epoch": 0.8031746031746032, "grad_norm": 0.24951832038119673, "learning_rate": 4.070952567620541e-05, "loss": 0.5083, "step": 759 }, { "epoch": 0.8042328042328042, "grad_norm": 0.25244068781335705, "learning_rate": 4.068992551940416e-05, "loss": 0.4868, "step": 760 }, { "epoch": 0.8052910052910053, "grad_norm": 0.24075958410510967, "learning_rate": 4.06703253626029e-05, "loss": 0.5132, "step": 761 }, { "epoch": 0.8063492063492064, "grad_norm": 0.299818761325055, "learning_rate": 4.0650725205801646e-05, "loss": 0.5085, "step": 762 }, { "epoch": 0.8074074074074075, "grad_norm": 0.2556890383489905, "learning_rate": 4.0631125049000395e-05, "loss": 0.6116, "step": 763 }, { "epoch": 0.8084656084656084, "grad_norm": 0.25781514459024585, "learning_rate": 4.061152489219914e-05, "loss": 0.5138, "step": 764 }, { "epoch": 0.8095238095238095, "grad_norm": 0.4812654512833488, "learning_rate": 4.059192473539789e-05, "loss": 0.5117, "step": 765 }, { "epoch": 0.8105820105820106, "grad_norm": 0.43995941221107904, "learning_rate": 4.057232457859663e-05, "loss": 0.523, "step": 766 }, { "epoch": 0.8116402116402116, "grad_norm": 0.2570086400263589, "learning_rate": 4.055272442179537e-05, "loss": 0.4922, "step": 767 }, { "epoch": 0.8126984126984127, "grad_norm": 0.27355262696697646, "learning_rate": 4.053312426499412e-05, "loss": 0.5214, "step": 768 }, { "epoch": 0.8137566137566138, "grad_norm": 0.30929404940417793, "learning_rate": 4.051352410819287e-05, "loss": 0.5595, "step": 769 }, { "epoch": 0.8148148148148148, "grad_norm": 0.25651422442360594, "learning_rate": 4.049392395139161e-05, "loss": 0.5977, "step": 770 }, { "epoch": 0.8158730158730159, "grad_norm": 0.3254465197758939, "learning_rate": 4.0474323794590355e-05, "loss": 0.5534, "step": 771 }, { "epoch": 0.816931216931217, "grad_norm": 0.2859692925402336, "learning_rate": 4.0454723637789104e-05, "loss": 0.56, "step": 772 }, { "epoch": 0.817989417989418, "grad_norm": 0.24530950245641758, "learning_rate": 4.0435123480987854e-05, "loss": 0.4727, "step": 773 }, { "epoch": 0.819047619047619, "grad_norm": 0.2852863920952454, "learning_rate": 4.0415523324186596e-05, "loss": 0.5166, "step": 774 }, { "epoch": 0.8201058201058201, "grad_norm": 0.28797276683526324, "learning_rate": 4.039592316738534e-05, "loss": 0.6108, "step": 775 }, { "epoch": 0.8211640211640212, "grad_norm": 0.24315570228929462, "learning_rate": 4.037632301058409e-05, "loss": 0.5383, "step": 776 }, { "epoch": 0.8222222222222222, "grad_norm": 0.27806729681229697, "learning_rate": 4.035672285378283e-05, "loss": 0.5445, "step": 777 }, { "epoch": 0.8232804232804233, "grad_norm": 0.27598442145764895, "learning_rate": 4.033712269698158e-05, "loss": 0.4656, "step": 778 }, { "epoch": 0.8243386243386244, "grad_norm": 0.22788260383702866, "learning_rate": 4.031752254018032e-05, "loss": 0.4989, "step": 779 }, { "epoch": 0.8253968253968254, "grad_norm": 0.2799668545860428, "learning_rate": 4.0297922383379065e-05, "loss": 0.6928, "step": 780 }, { "epoch": 0.8264550264550264, "grad_norm": 0.2393669331758126, "learning_rate": 4.0278322226577814e-05, "loss": 0.4514, "step": 781 }, { "epoch": 0.8275132275132275, "grad_norm": 0.2625212858096316, "learning_rate": 4.025872206977656e-05, "loss": 0.5826, "step": 782 }, { "epoch": 0.8285714285714286, "grad_norm": 0.28090132618519864, "learning_rate": 4.0239121912975306e-05, "loss": 0.5763, "step": 783 }, { "epoch": 0.8296296296296296, "grad_norm": 0.254545648971496, "learning_rate": 4.021952175617405e-05, "loss": 0.5603, "step": 784 }, { "epoch": 0.8306878306878307, "grad_norm": 0.3026212789118765, "learning_rate": 4.01999215993728e-05, "loss": 0.5572, "step": 785 }, { "epoch": 0.8317460317460318, "grad_norm": 0.2583084756689895, "learning_rate": 4.018032144257155e-05, "loss": 0.5687, "step": 786 }, { "epoch": 0.8328042328042328, "grad_norm": 0.2912070717602156, "learning_rate": 4.016072128577029e-05, "loss": 0.4976, "step": 787 }, { "epoch": 0.8338624338624339, "grad_norm": 0.3091276282835055, "learning_rate": 4.014112112896903e-05, "loss": 0.602, "step": 788 }, { "epoch": 0.834920634920635, "grad_norm": 0.2614685006059475, "learning_rate": 4.012152097216778e-05, "loss": 0.6047, "step": 789 }, { "epoch": 0.8359788359788359, "grad_norm": 0.2976678306406433, "learning_rate": 4.010192081536652e-05, "loss": 0.5572, "step": 790 }, { "epoch": 0.837037037037037, "grad_norm": 0.26695480665010896, "learning_rate": 4.008232065856527e-05, "loss": 0.5368, "step": 791 }, { "epoch": 0.8380952380952381, "grad_norm": 0.27224551418357, "learning_rate": 4.0062720501764015e-05, "loss": 0.5715, "step": 792 }, { "epoch": 0.8391534391534392, "grad_norm": 0.24476160512891385, "learning_rate": 4.004312034496276e-05, "loss": 0.5734, "step": 793 }, { "epoch": 0.8402116402116402, "grad_norm": 0.26511024491224705, "learning_rate": 4.002352018816151e-05, "loss": 0.5795, "step": 794 }, { "epoch": 0.8412698412698413, "grad_norm": 0.34773604650933276, "learning_rate": 4.0003920031360256e-05, "loss": 0.5005, "step": 795 }, { "epoch": 0.8423280423280424, "grad_norm": 0.2615336683961906, "learning_rate": 3.9984319874559e-05, "loss": 0.5364, "step": 796 }, { "epoch": 0.8433862433862434, "grad_norm": 0.2739470851171991, "learning_rate": 3.996471971775774e-05, "loss": 0.4933, "step": 797 }, { "epoch": 0.8444444444444444, "grad_norm": 0.3482221259725657, "learning_rate": 3.994511956095649e-05, "loss": 0.6087, "step": 798 }, { "epoch": 0.8455026455026455, "grad_norm": 7.054547078978019, "learning_rate": 3.992551940415524e-05, "loss": 0.5531, "step": 799 }, { "epoch": 0.8465608465608465, "grad_norm": 0.3025752493200541, "learning_rate": 3.990591924735398e-05, "loss": 0.5327, "step": 800 }, { "epoch": 0.8476190476190476, "grad_norm": 0.2515881041243724, "learning_rate": 3.9886319090552725e-05, "loss": 0.5089, "step": 801 }, { "epoch": 0.8486772486772487, "grad_norm": 0.29447997518392294, "learning_rate": 3.9866718933751474e-05, "loss": 0.5635, "step": 802 }, { "epoch": 0.8497354497354498, "grad_norm": 1.2056063054042254, "learning_rate": 3.9847118776950216e-05, "loss": 0.568, "step": 803 }, { "epoch": 0.8507936507936508, "grad_norm": 0.24867885592706862, "learning_rate": 3.9827518620148966e-05, "loss": 0.5172, "step": 804 }, { "epoch": 0.8518518518518519, "grad_norm": 0.2833308927773852, "learning_rate": 3.980791846334771e-05, "loss": 0.5552, "step": 805 }, { "epoch": 0.852910052910053, "grad_norm": 0.43186449303900065, "learning_rate": 3.978831830654645e-05, "loss": 0.4626, "step": 806 }, { "epoch": 0.8539682539682539, "grad_norm": 0.28455060749504574, "learning_rate": 3.97687181497452e-05, "loss": 0.5228, "step": 807 }, { "epoch": 0.855026455026455, "grad_norm": 0.29552626608893695, "learning_rate": 3.974911799294395e-05, "loss": 0.5452, "step": 808 }, { "epoch": 0.8560846560846561, "grad_norm": 0.24084287912602692, "learning_rate": 3.972951783614269e-05, "loss": 0.54, "step": 809 }, { "epoch": 0.8571428571428571, "grad_norm": 0.28708908508185177, "learning_rate": 3.9709917679341434e-05, "loss": 0.524, "step": 810 }, { "epoch": 0.8582010582010582, "grad_norm": 0.26965168845733906, "learning_rate": 3.969031752254018e-05, "loss": 0.5084, "step": 811 }, { "epoch": 0.8592592592592593, "grad_norm": 0.27200841727390407, "learning_rate": 3.967071736573893e-05, "loss": 0.6633, "step": 812 }, { "epoch": 0.8603174603174604, "grad_norm": 0.4577420835451772, "learning_rate": 3.965111720893767e-05, "loss": 0.5984, "step": 813 }, { "epoch": 0.8613756613756614, "grad_norm": 0.23226446549107316, "learning_rate": 3.963151705213642e-05, "loss": 0.5383, "step": 814 }, { "epoch": 0.8624338624338624, "grad_norm": 0.24732625840438446, "learning_rate": 3.961191689533517e-05, "loss": 0.4663, "step": 815 }, { "epoch": 0.8634920634920635, "grad_norm": 0.2442585038196715, "learning_rate": 3.959231673853391e-05, "loss": 0.4915, "step": 816 }, { "epoch": 0.8645502645502645, "grad_norm": 0.24133270544480462, "learning_rate": 3.957271658173265e-05, "loss": 0.5099, "step": 817 }, { "epoch": 0.8656084656084656, "grad_norm": 0.2375341297040655, "learning_rate": 3.95531164249314e-05, "loss": 0.4853, "step": 818 }, { "epoch": 0.8666666666666667, "grad_norm": 0.2374457163459558, "learning_rate": 3.953351626813015e-05, "loss": 0.497, "step": 819 }, { "epoch": 0.8677248677248677, "grad_norm": 0.2944923584520418, "learning_rate": 3.951391611132889e-05, "loss": 0.5841, "step": 820 }, { "epoch": 0.8687830687830688, "grad_norm": 0.3103954396715373, "learning_rate": 3.9494315954527635e-05, "loss": 0.6308, "step": 821 }, { "epoch": 0.8698412698412699, "grad_norm": 0.3044635700570842, "learning_rate": 3.9474715797726385e-05, "loss": 0.6198, "step": 822 }, { "epoch": 0.870899470899471, "grad_norm": 0.25175807643491943, "learning_rate": 3.945511564092513e-05, "loss": 0.5508, "step": 823 }, { "epoch": 0.8719576719576719, "grad_norm": 0.22937033545757074, "learning_rate": 3.9435515484123876e-05, "loss": 0.5033, "step": 824 }, { "epoch": 0.873015873015873, "grad_norm": 0.2414211430197171, "learning_rate": 3.9415915327322626e-05, "loss": 0.5281, "step": 825 }, { "epoch": 0.8740740740740741, "grad_norm": 0.36303292523703895, "learning_rate": 3.939631517052136e-05, "loss": 0.4688, "step": 826 }, { "epoch": 0.8751322751322751, "grad_norm": 0.2840607840231142, "learning_rate": 3.937671501372011e-05, "loss": 0.5765, "step": 827 }, { "epoch": 0.8761904761904762, "grad_norm": 0.26470574352059545, "learning_rate": 3.935711485691886e-05, "loss": 0.5827, "step": 828 }, { "epoch": 0.8772486772486773, "grad_norm": 0.23448435580139299, "learning_rate": 3.93375147001176e-05, "loss": 0.4977, "step": 829 }, { "epoch": 0.8783068783068783, "grad_norm": 0.2635026772188355, "learning_rate": 3.9317914543316345e-05, "loss": 0.521, "step": 830 }, { "epoch": 0.8793650793650793, "grad_norm": 0.26987056951962557, "learning_rate": 3.9298314386515094e-05, "loss": 0.5562, "step": 831 }, { "epoch": 0.8804232804232804, "grad_norm": 0.2365896896674794, "learning_rate": 3.927871422971384e-05, "loss": 0.5206, "step": 832 }, { "epoch": 0.8814814814814815, "grad_norm": 0.2617025654901415, "learning_rate": 3.9259114072912586e-05, "loss": 0.53, "step": 833 }, { "epoch": 0.8825396825396825, "grad_norm": 0.24319463532557173, "learning_rate": 3.923951391611133e-05, "loss": 0.5784, "step": 834 }, { "epoch": 0.8835978835978836, "grad_norm": 0.30018809738193136, "learning_rate": 3.921991375931008e-05, "loss": 0.5594, "step": 835 }, { "epoch": 0.8846560846560847, "grad_norm": 0.24564159380840794, "learning_rate": 3.920031360250882e-05, "loss": 0.5678, "step": 836 }, { "epoch": 0.8857142857142857, "grad_norm": 0.24469707503949964, "learning_rate": 3.918071344570757e-05, "loss": 0.6056, "step": 837 }, { "epoch": 0.8867724867724868, "grad_norm": 0.28670136066959284, "learning_rate": 3.916111328890631e-05, "loss": 0.503, "step": 838 }, { "epoch": 0.8878306878306879, "grad_norm": 0.25036374517538473, "learning_rate": 3.9141513132105054e-05, "loss": 0.536, "step": 839 }, { "epoch": 0.8888888888888888, "grad_norm": 0.22936770624952652, "learning_rate": 3.9121912975303804e-05, "loss": 0.4632, "step": 840 }, { "epoch": 0.8899470899470899, "grad_norm": 0.23888772118569762, "learning_rate": 3.910231281850255e-05, "loss": 0.539, "step": 841 }, { "epoch": 0.891005291005291, "grad_norm": 0.26618130929626477, "learning_rate": 3.9082712661701295e-05, "loss": 0.5135, "step": 842 }, { "epoch": 0.8920634920634921, "grad_norm": 0.23048542050631554, "learning_rate": 3.906311250490004e-05, "loss": 0.4616, "step": 843 }, { "epoch": 0.8931216931216931, "grad_norm": 0.23846574548002752, "learning_rate": 3.904351234809879e-05, "loss": 0.56, "step": 844 }, { "epoch": 0.8941798941798942, "grad_norm": 0.25578679656745784, "learning_rate": 3.9023912191297536e-05, "loss": 0.5171, "step": 845 }, { "epoch": 0.8952380952380953, "grad_norm": 0.2604178853945741, "learning_rate": 3.900431203449628e-05, "loss": 0.4736, "step": 846 }, { "epoch": 0.8962962962962963, "grad_norm": 0.2543159342822874, "learning_rate": 3.898471187769502e-05, "loss": 0.553, "step": 847 }, { "epoch": 0.8973544973544973, "grad_norm": 0.25668454175903266, "learning_rate": 3.896511172089377e-05, "loss": 0.5986, "step": 848 }, { "epoch": 0.8984126984126984, "grad_norm": 0.27128711537893746, "learning_rate": 3.894551156409251e-05, "loss": 0.6013, "step": 849 }, { "epoch": 0.8994708994708994, "grad_norm": 0.25661383335622456, "learning_rate": 3.892591140729126e-05, "loss": 0.6018, "step": 850 }, { "epoch": 0.9005291005291005, "grad_norm": 0.26144285059562283, "learning_rate": 3.8906311250490005e-05, "loss": 0.5544, "step": 851 }, { "epoch": 0.9015873015873016, "grad_norm": 0.24851759763750786, "learning_rate": 3.888671109368875e-05, "loss": 0.5417, "step": 852 }, { "epoch": 0.9026455026455027, "grad_norm": 0.22366950486260603, "learning_rate": 3.8867110936887497e-05, "loss": 0.5216, "step": 853 }, { "epoch": 0.9037037037037037, "grad_norm": 0.25522407995231283, "learning_rate": 3.8847510780086246e-05, "loss": 0.5511, "step": 854 }, { "epoch": 0.9047619047619048, "grad_norm": 0.2459244748277179, "learning_rate": 3.882791062328499e-05, "loss": 0.4738, "step": 855 }, { "epoch": 0.9058201058201059, "grad_norm": 0.23773875962661492, "learning_rate": 3.880831046648373e-05, "loss": 0.5866, "step": 856 }, { "epoch": 0.9068783068783068, "grad_norm": 0.2580145368233666, "learning_rate": 3.878871030968248e-05, "loss": 0.5448, "step": 857 }, { "epoch": 0.9079365079365079, "grad_norm": 0.24118543676000287, "learning_rate": 3.876911015288123e-05, "loss": 0.5223, "step": 858 }, { "epoch": 0.908994708994709, "grad_norm": 0.24514536124293135, "learning_rate": 3.874950999607997e-05, "loss": 0.5641, "step": 859 }, { "epoch": 0.91005291005291, "grad_norm": 0.23032586872284455, "learning_rate": 3.8729909839278714e-05, "loss": 0.5285, "step": 860 }, { "epoch": 0.9111111111111111, "grad_norm": 0.24435066352876703, "learning_rate": 3.8710309682477464e-05, "loss": 0.5423, "step": 861 }, { "epoch": 0.9121693121693122, "grad_norm": 0.2622407130624908, "learning_rate": 3.8690709525676206e-05, "loss": 0.5741, "step": 862 }, { "epoch": 0.9132275132275133, "grad_norm": 1.3636871490670779, "learning_rate": 3.8671109368874955e-05, "loss": 0.7745, "step": 863 }, { "epoch": 0.9142857142857143, "grad_norm": 0.3780723682991901, "learning_rate": 3.86515092120737e-05, "loss": 0.5085, "step": 864 }, { "epoch": 0.9153439153439153, "grad_norm": 0.24726218347601356, "learning_rate": 3.863190905527244e-05, "loss": 0.5712, "step": 865 }, { "epoch": 0.9164021164021164, "grad_norm": 0.27126893307687105, "learning_rate": 3.861230889847119e-05, "loss": 0.5255, "step": 866 }, { "epoch": 0.9174603174603174, "grad_norm": 0.2426515970790078, "learning_rate": 3.859270874166994e-05, "loss": 0.5768, "step": 867 }, { "epoch": 0.9185185185185185, "grad_norm": 0.28594907335057773, "learning_rate": 3.857310858486868e-05, "loss": 0.5528, "step": 868 }, { "epoch": 0.9195767195767196, "grad_norm": 0.23606697714730085, "learning_rate": 3.8553508428067424e-05, "loss": 0.5133, "step": 869 }, { "epoch": 0.9206349206349206, "grad_norm": 0.27926385748395915, "learning_rate": 3.853390827126617e-05, "loss": 0.5796, "step": 870 }, { "epoch": 0.9216931216931217, "grad_norm": 0.24343714813051942, "learning_rate": 3.851430811446492e-05, "loss": 0.5295, "step": 871 }, { "epoch": 0.9227513227513228, "grad_norm": 0.26244227600390824, "learning_rate": 3.8494707957663665e-05, "loss": 0.4905, "step": 872 }, { "epoch": 0.9238095238095239, "grad_norm": 0.2765749086021136, "learning_rate": 3.847510780086241e-05, "loss": 0.5227, "step": 873 }, { "epoch": 0.9248677248677248, "grad_norm": 0.22924047622529786, "learning_rate": 3.8455507644061157e-05, "loss": 0.5447, "step": 874 }, { "epoch": 0.9259259259259259, "grad_norm": 0.26004927012669626, "learning_rate": 3.84359074872599e-05, "loss": 0.5876, "step": 875 }, { "epoch": 0.926984126984127, "grad_norm": 0.23257781793426832, "learning_rate": 3.841630733045865e-05, "loss": 0.4462, "step": 876 }, { "epoch": 0.928042328042328, "grad_norm": 0.23530057335980378, "learning_rate": 3.839670717365739e-05, "loss": 0.5099, "step": 877 }, { "epoch": 0.9291005291005291, "grad_norm": 0.22869829817586848, "learning_rate": 3.837710701685613e-05, "loss": 0.4933, "step": 878 }, { "epoch": 0.9301587301587302, "grad_norm": 0.2538072480043844, "learning_rate": 3.835750686005488e-05, "loss": 0.4838, "step": 879 }, { "epoch": 0.9312169312169312, "grad_norm": 0.23947855637218263, "learning_rate": 3.833790670325363e-05, "loss": 0.5569, "step": 880 }, { "epoch": 0.9322751322751323, "grad_norm": 0.2379463855293634, "learning_rate": 3.8318306546452374e-05, "loss": 0.5169, "step": 881 }, { "epoch": 0.9333333333333333, "grad_norm": 0.2706377388516219, "learning_rate": 3.829870638965112e-05, "loss": 0.4809, "step": 882 }, { "epoch": 0.9343915343915344, "grad_norm": 0.2599921459464922, "learning_rate": 3.8279106232849866e-05, "loss": 0.5511, "step": 883 }, { "epoch": 0.9354497354497354, "grad_norm": 0.23700018733026876, "learning_rate": 3.8259506076048615e-05, "loss": 0.5508, "step": 884 }, { "epoch": 0.9365079365079365, "grad_norm": 0.27148173561209227, "learning_rate": 3.823990591924735e-05, "loss": 0.5598, "step": 885 }, { "epoch": 0.9375661375661376, "grad_norm": 0.23284787985918257, "learning_rate": 3.82203057624461e-05, "loss": 0.4889, "step": 886 }, { "epoch": 0.9386243386243386, "grad_norm": 0.2289097690533918, "learning_rate": 3.820070560564485e-05, "loss": 0.484, "step": 887 }, { "epoch": 0.9396825396825397, "grad_norm": 0.29024067491700634, "learning_rate": 3.818110544884359e-05, "loss": 0.527, "step": 888 }, { "epoch": 0.9407407407407408, "grad_norm": 0.2573481910146356, "learning_rate": 3.8161505292042334e-05, "loss": 0.4921, "step": 889 }, { "epoch": 0.9417989417989417, "grad_norm": 0.2287016832994264, "learning_rate": 3.8141905135241084e-05, "loss": 0.4975, "step": 890 }, { "epoch": 0.9428571428571428, "grad_norm": 0.2793182907010499, "learning_rate": 3.8122304978439826e-05, "loss": 0.5901, "step": 891 }, { "epoch": 0.9439153439153439, "grad_norm": 0.2815450954484471, "learning_rate": 3.8102704821638575e-05, "loss": 0.4907, "step": 892 }, { "epoch": 0.944973544973545, "grad_norm": 0.24825039265577378, "learning_rate": 3.808310466483732e-05, "loss": 0.5845, "step": 893 }, { "epoch": 0.946031746031746, "grad_norm": 0.2684156933456286, "learning_rate": 3.806350450803607e-05, "loss": 0.4972, "step": 894 }, { "epoch": 0.9470899470899471, "grad_norm": 0.2777947126936837, "learning_rate": 3.804390435123481e-05, "loss": 0.5588, "step": 895 }, { "epoch": 0.9481481481481482, "grad_norm": 0.2479091360698014, "learning_rate": 3.802430419443356e-05, "loss": 0.5882, "step": 896 }, { "epoch": 0.9492063492063492, "grad_norm": 0.2657834372378511, "learning_rate": 3.800470403763231e-05, "loss": 0.5495, "step": 897 }, { "epoch": 0.9502645502645503, "grad_norm": 0.9282022073586812, "learning_rate": 3.7985103880831044e-05, "loss": 0.5746, "step": 898 }, { "epoch": 0.9513227513227513, "grad_norm": 0.25172528574506164, "learning_rate": 3.796550372402979e-05, "loss": 0.4896, "step": 899 }, { "epoch": 0.9523809523809523, "grad_norm": 0.2555445883398386, "learning_rate": 3.794590356722854e-05, "loss": 0.5736, "step": 900 }, { "epoch": 0.9534391534391534, "grad_norm": 0.249355354605511, "learning_rate": 3.7926303410427285e-05, "loss": 0.4817, "step": 901 }, { "epoch": 0.9544973544973545, "grad_norm": 0.2149812919434468, "learning_rate": 3.790670325362603e-05, "loss": 0.4524, "step": 902 }, { "epoch": 0.9555555555555556, "grad_norm": 0.2703911118438873, "learning_rate": 3.788710309682478e-05, "loss": 0.5443, "step": 903 }, { "epoch": 0.9566137566137566, "grad_norm": 0.223977288588384, "learning_rate": 3.786750294002352e-05, "loss": 0.4925, "step": 904 }, { "epoch": 0.9576719576719577, "grad_norm": 0.27011460419403965, "learning_rate": 3.784790278322227e-05, "loss": 0.5812, "step": 905 }, { "epoch": 0.9587301587301588, "grad_norm": 0.23957948838890472, "learning_rate": 3.782830262642101e-05, "loss": 0.5007, "step": 906 }, { "epoch": 0.9597883597883597, "grad_norm": 0.2264533105409485, "learning_rate": 3.780870246961976e-05, "loss": 0.5141, "step": 907 }, { "epoch": 0.9608465608465608, "grad_norm": 0.24934710629027704, "learning_rate": 3.77891023128185e-05, "loss": 0.5271, "step": 908 }, { "epoch": 0.9619047619047619, "grad_norm": 0.24400597517616687, "learning_rate": 3.776950215601725e-05, "loss": 0.5524, "step": 909 }, { "epoch": 0.9629629629629629, "grad_norm": 0.24087812537974612, "learning_rate": 3.7749901999215994e-05, "loss": 0.4046, "step": 910 }, { "epoch": 0.964021164021164, "grad_norm": 2.6978887892991747, "learning_rate": 3.773030184241474e-05, "loss": 0.5807, "step": 911 }, { "epoch": 0.9650793650793651, "grad_norm": 0.31338696022947576, "learning_rate": 3.7710701685613486e-05, "loss": 0.5033, "step": 912 }, { "epoch": 0.9661375661375662, "grad_norm": 0.3061821128532338, "learning_rate": 3.7691101528812235e-05, "loss": 0.5131, "step": 913 }, { "epoch": 0.9671957671957672, "grad_norm": 0.23119947689356746, "learning_rate": 3.767150137201098e-05, "loss": 0.4257, "step": 914 }, { "epoch": 0.9682539682539683, "grad_norm": 0.27958747828152924, "learning_rate": 3.765190121520972e-05, "loss": 0.543, "step": 915 }, { "epoch": 0.9693121693121693, "grad_norm": 0.30495291790476814, "learning_rate": 3.763230105840847e-05, "loss": 0.5112, "step": 916 }, { "epoch": 0.9703703703703703, "grad_norm": 0.2815945843930539, "learning_rate": 3.761270090160722e-05, "loss": 0.6096, "step": 917 }, { "epoch": 0.9714285714285714, "grad_norm": 0.27333650578935825, "learning_rate": 3.759310074480596e-05, "loss": 0.5376, "step": 918 }, { "epoch": 0.9724867724867725, "grad_norm": 0.2982346639473295, "learning_rate": 3.7573500588004704e-05, "loss": 0.452, "step": 919 }, { "epoch": 0.9735449735449735, "grad_norm": 0.2677922848009343, "learning_rate": 3.755390043120345e-05, "loss": 0.4959, "step": 920 }, { "epoch": 0.9746031746031746, "grad_norm": 0.2598899505130424, "learning_rate": 3.7534300274402196e-05, "loss": 0.5954, "step": 921 }, { "epoch": 0.9756613756613757, "grad_norm": 0.29688775573892967, "learning_rate": 3.7514700117600945e-05, "loss": 0.4943, "step": 922 }, { "epoch": 0.9767195767195768, "grad_norm": 0.21278741966910175, "learning_rate": 3.749509996079969e-05, "loss": 0.469, "step": 923 }, { "epoch": 0.9777777777777777, "grad_norm": 0.27458087883654253, "learning_rate": 3.747549980399843e-05, "loss": 0.5852, "step": 924 }, { "epoch": 0.9788359788359788, "grad_norm": 0.26092414227041094, "learning_rate": 3.745589964719718e-05, "loss": 0.5247, "step": 925 }, { "epoch": 0.9798941798941799, "grad_norm": 0.22726587133642012, "learning_rate": 3.743629949039593e-05, "loss": 0.5845, "step": 926 }, { "epoch": 0.9809523809523809, "grad_norm": 0.24328946071436178, "learning_rate": 3.741669933359467e-05, "loss": 0.5947, "step": 927 }, { "epoch": 0.982010582010582, "grad_norm": 0.25227625801156794, "learning_rate": 3.7397099176793413e-05, "loss": 0.5245, "step": 928 }, { "epoch": 0.9830687830687831, "grad_norm": 1.8750127781404309, "learning_rate": 3.737749901999216e-05, "loss": 0.5496, "step": 929 }, { "epoch": 0.9841269841269841, "grad_norm": 0.2690191369601627, "learning_rate": 3.735789886319091e-05, "loss": 0.5706, "step": 930 }, { "epoch": 0.9851851851851852, "grad_norm": 0.23072633191225864, "learning_rate": 3.7338298706389654e-05, "loss": 0.5585, "step": 931 }, { "epoch": 0.9862433862433863, "grad_norm": 0.24321911538830931, "learning_rate": 3.73186985495884e-05, "loss": 0.5314, "step": 932 }, { "epoch": 0.9873015873015873, "grad_norm": 0.2265652217349377, "learning_rate": 3.7299098392787146e-05, "loss": 0.5157, "step": 933 }, { "epoch": 0.9883597883597883, "grad_norm": 0.2798879913889636, "learning_rate": 3.727949823598589e-05, "loss": 0.527, "step": 934 }, { "epoch": 0.9894179894179894, "grad_norm": 0.3392043127421207, "learning_rate": 3.725989807918464e-05, "loss": 0.5649, "step": 935 }, { "epoch": 0.9904761904761905, "grad_norm": 0.46994870334965927, "learning_rate": 3.724029792238338e-05, "loss": 0.5019, "step": 936 }, { "epoch": 0.9915343915343915, "grad_norm": 0.31670821542256694, "learning_rate": 3.722069776558212e-05, "loss": 0.4984, "step": 937 }, { "epoch": 0.9925925925925926, "grad_norm": 0.30042583096668124, "learning_rate": 3.720109760878087e-05, "loss": 0.443, "step": 938 }, { "epoch": 0.9936507936507937, "grad_norm": 0.26554660858783213, "learning_rate": 3.718149745197962e-05, "loss": 0.5143, "step": 939 }, { "epoch": 0.9947089947089947, "grad_norm": 0.31047007588956205, "learning_rate": 3.7161897295178364e-05, "loss": 0.5113, "step": 940 }, { "epoch": 0.9957671957671957, "grad_norm": 0.32328394275671823, "learning_rate": 3.7142297138377106e-05, "loss": 0.5627, "step": 941 }, { "epoch": 0.9968253968253968, "grad_norm": 0.30050781187053605, "learning_rate": 3.7122696981575856e-05, "loss": 0.4492, "step": 942 }, { "epoch": 0.9978835978835979, "grad_norm": 0.2883856032205587, "learning_rate": 3.7103096824774605e-05, "loss": 0.573, "step": 943 }, { "epoch": 0.9989417989417989, "grad_norm": 0.3095113640917161, "learning_rate": 3.708349666797335e-05, "loss": 0.5471, "step": 944 }, { "epoch": 1.0, "grad_norm": 0.2740280000061703, "learning_rate": 3.706389651117209e-05, "loss": 0.5048, "step": 945 }, { "epoch": 1.001058201058201, "grad_norm": 0.3430281338192382, "learning_rate": 3.704429635437084e-05, "loss": 0.484, "step": 946 }, { "epoch": 1.0021164021164022, "grad_norm": 0.5765855281602044, "learning_rate": 3.702469619756958e-05, "loss": 0.421, "step": 947 }, { "epoch": 1.0031746031746032, "grad_norm": 0.32549789453659606, "learning_rate": 3.700509604076833e-05, "loss": 0.5094, "step": 948 }, { "epoch": 1.0042328042328041, "grad_norm": 0.27167110880361595, "learning_rate": 3.698549588396707e-05, "loss": 0.4292, "step": 949 }, { "epoch": 1.0052910052910053, "grad_norm": 0.2634410312734115, "learning_rate": 3.6965895727165816e-05, "loss": 0.4056, "step": 950 }, { "epoch": 1.0063492063492063, "grad_norm": 0.290932789089174, "learning_rate": 3.6946295570364565e-05, "loss": 0.4966, "step": 951 }, { "epoch": 1.0074074074074073, "grad_norm": 0.277649198156178, "learning_rate": 3.6926695413563314e-05, "loss": 0.5468, "step": 952 }, { "epoch": 1.0084656084656085, "grad_norm": 0.2871264585009383, "learning_rate": 3.690709525676205e-05, "loss": 0.4619, "step": 953 }, { "epoch": 1.0095238095238095, "grad_norm": 0.26922960005310304, "learning_rate": 3.68874950999608e-05, "loss": 0.4126, "step": 954 }, { "epoch": 1.0105820105820107, "grad_norm": 0.2630654741035761, "learning_rate": 3.686789494315955e-05, "loss": 0.4967, "step": 955 }, { "epoch": 1.0116402116402117, "grad_norm": 0.30760949861741693, "learning_rate": 3.68482947863583e-05, "loss": 0.3988, "step": 956 }, { "epoch": 1.0126984126984127, "grad_norm": 0.30191598263981867, "learning_rate": 3.6828694629557034e-05, "loss": 0.4891, "step": 957 }, { "epoch": 1.0137566137566139, "grad_norm": 0.24918234554246604, "learning_rate": 3.680909447275578e-05, "loss": 0.4265, "step": 958 }, { "epoch": 1.0148148148148148, "grad_norm": 0.25444420109426397, "learning_rate": 3.678949431595453e-05, "loss": 0.4755, "step": 959 }, { "epoch": 1.0158730158730158, "grad_norm": 0.2704029549763871, "learning_rate": 3.6769894159153275e-05, "loss": 0.4622, "step": 960 }, { "epoch": 1.016931216931217, "grad_norm": 0.25814705258519083, "learning_rate": 3.675029400235202e-05, "loss": 0.45, "step": 961 }, { "epoch": 1.017989417989418, "grad_norm": 0.24396615670006316, "learning_rate": 3.6730693845550766e-05, "loss": 0.4621, "step": 962 }, { "epoch": 1.019047619047619, "grad_norm": 0.24613409465997452, "learning_rate": 3.671109368874951e-05, "loss": 0.3997, "step": 963 }, { "epoch": 1.0201058201058202, "grad_norm": 0.2446865998835932, "learning_rate": 3.669149353194826e-05, "loss": 0.4312, "step": 964 }, { "epoch": 1.0211640211640212, "grad_norm": 0.24064127384639672, "learning_rate": 3.6671893375147e-05, "loss": 0.4879, "step": 965 }, { "epoch": 1.0222222222222221, "grad_norm": 0.24643135413567363, "learning_rate": 3.665229321834575e-05, "loss": 0.4811, "step": 966 }, { "epoch": 1.0232804232804233, "grad_norm": 0.3002457243261394, "learning_rate": 3.663269306154449e-05, "loss": 0.5209, "step": 967 }, { "epoch": 1.0243386243386243, "grad_norm": 0.22426207621628652, "learning_rate": 3.661309290474324e-05, "loss": 0.4123, "step": 968 }, { "epoch": 1.0253968253968253, "grad_norm": 0.24667559321374757, "learning_rate": 3.659349274794199e-05, "loss": 0.462, "step": 969 }, { "epoch": 1.0264550264550265, "grad_norm": 0.21521332687361075, "learning_rate": 3.6573892591140727e-05, "loss": 0.3874, "step": 970 }, { "epoch": 1.0275132275132275, "grad_norm": 0.21371142578879992, "learning_rate": 3.6554292434339476e-05, "loss": 0.4216, "step": 971 }, { "epoch": 1.0285714285714285, "grad_norm": 0.2457752256589608, "learning_rate": 3.6534692277538225e-05, "loss": 0.4751, "step": 972 }, { "epoch": 1.0296296296296297, "grad_norm": 0.23036724262890812, "learning_rate": 3.651509212073697e-05, "loss": 0.4582, "step": 973 }, { "epoch": 1.0306878306878307, "grad_norm": 0.2398481443361909, "learning_rate": 3.649549196393571e-05, "loss": 0.4908, "step": 974 }, { "epoch": 1.0317460317460316, "grad_norm": 0.2191458862749879, "learning_rate": 3.647589180713446e-05, "loss": 0.3675, "step": 975 }, { "epoch": 1.0328042328042328, "grad_norm": 0.24624710147713, "learning_rate": 3.64562916503332e-05, "loss": 0.4168, "step": 976 }, { "epoch": 1.0338624338624338, "grad_norm": 0.2505135808681229, "learning_rate": 3.643669149353195e-05, "loss": 0.4317, "step": 977 }, { "epoch": 1.034920634920635, "grad_norm": 0.23347770709457338, "learning_rate": 3.6417091336730694e-05, "loss": 0.4385, "step": 978 }, { "epoch": 1.035978835978836, "grad_norm": 0.228128885877278, "learning_rate": 3.639749117992944e-05, "loss": 0.4258, "step": 979 }, { "epoch": 1.037037037037037, "grad_norm": 0.2629471686081972, "learning_rate": 3.6377891023128185e-05, "loss": 0.4682, "step": 980 }, { "epoch": 1.0380952380952382, "grad_norm": 0.2649161639864007, "learning_rate": 3.6358290866326935e-05, "loss": 0.4061, "step": 981 }, { "epoch": 1.0391534391534392, "grad_norm": 0.23269995847151623, "learning_rate": 3.633869070952568e-05, "loss": 0.3948, "step": 982 }, { "epoch": 1.0402116402116401, "grad_norm": 0.3069159899042451, "learning_rate": 3.631909055272442e-05, "loss": 0.4964, "step": 983 }, { "epoch": 1.0412698412698413, "grad_norm": 0.28329260505792503, "learning_rate": 3.629949039592317e-05, "loss": 0.446, "step": 984 }, { "epoch": 1.0423280423280423, "grad_norm": 0.22085634036978702, "learning_rate": 3.627989023912192e-05, "loss": 0.397, "step": 985 }, { "epoch": 1.0433862433862433, "grad_norm": 0.2533209615376046, "learning_rate": 3.626029008232066e-05, "loss": 0.4478, "step": 986 }, { "epoch": 1.0444444444444445, "grad_norm": 0.24209341289293657, "learning_rate": 3.62406899255194e-05, "loss": 0.4514, "step": 987 }, { "epoch": 1.0455026455026455, "grad_norm": 0.22467546448533063, "learning_rate": 3.622108976871815e-05, "loss": 0.4335, "step": 988 }, { "epoch": 1.0465608465608465, "grad_norm": 0.21049472574558023, "learning_rate": 3.6201489611916895e-05, "loss": 0.408, "step": 989 }, { "epoch": 1.0476190476190477, "grad_norm": 0.27894693629024636, "learning_rate": 3.6181889455115644e-05, "loss": 0.4497, "step": 990 }, { "epoch": 1.0486772486772487, "grad_norm": 0.2569069796077995, "learning_rate": 3.6162289298314387e-05, "loss": 0.4601, "step": 991 }, { "epoch": 1.0497354497354496, "grad_norm": 0.2284829852995991, "learning_rate": 3.6142689141513136e-05, "loss": 0.3953, "step": 992 }, { "epoch": 1.0507936507936508, "grad_norm": 0.29209053034281596, "learning_rate": 3.612308898471188e-05, "loss": 0.5659, "step": 993 }, { "epoch": 1.0518518518518518, "grad_norm": 0.3674972112779229, "learning_rate": 3.610348882791063e-05, "loss": 0.5056, "step": 994 }, { "epoch": 1.052910052910053, "grad_norm": 0.24428585787867374, "learning_rate": 3.608388867110937e-05, "loss": 0.4746, "step": 995 }, { "epoch": 1.053968253968254, "grad_norm": 0.24244488023201471, "learning_rate": 3.606428851430811e-05, "loss": 0.4863, "step": 996 }, { "epoch": 1.055026455026455, "grad_norm": 0.2261470911889456, "learning_rate": 3.604468835750686e-05, "loss": 0.449, "step": 997 }, { "epoch": 1.0560846560846562, "grad_norm": 0.23100103297173594, "learning_rate": 3.602508820070561e-05, "loss": 0.4325, "step": 998 }, { "epoch": 1.0571428571428572, "grad_norm": 0.2383765185611898, "learning_rate": 3.6005488043904354e-05, "loss": 0.4625, "step": 999 }, { "epoch": 1.0582010582010581, "grad_norm": 0.22794426514581065, "learning_rate": 3.5985887887103096e-05, "loss": 0.4055, "step": 1000 }, { "epoch": 1.0592592592592593, "grad_norm": 0.24878598922435188, "learning_rate": 3.5966287730301845e-05, "loss": 0.4176, "step": 1001 }, { "epoch": 1.0603174603174603, "grad_norm": 0.24946112914172103, "learning_rate": 3.594668757350059e-05, "loss": 0.4949, "step": 1002 }, { "epoch": 1.0613756613756613, "grad_norm": 0.24129845809349035, "learning_rate": 3.592708741669934e-05, "loss": 0.3912, "step": 1003 }, { "epoch": 1.0624338624338625, "grad_norm": 0.24281750665894428, "learning_rate": 3.590748725989808e-05, "loss": 0.4568, "step": 1004 }, { "epoch": 1.0634920634920635, "grad_norm": 0.20760278598384954, "learning_rate": 3.588788710309683e-05, "loss": 0.3948, "step": 1005 }, { "epoch": 1.0645502645502645, "grad_norm": 0.23431721866007746, "learning_rate": 3.586828694629557e-05, "loss": 0.424, "step": 1006 }, { "epoch": 1.0656084656084657, "grad_norm": 0.2471701897811236, "learning_rate": 3.584868678949432e-05, "loss": 0.4256, "step": 1007 }, { "epoch": 1.0666666666666667, "grad_norm": 0.2302012676934357, "learning_rate": 3.582908663269306e-05, "loss": 0.4277, "step": 1008 }, { "epoch": 1.0677248677248676, "grad_norm": 0.22452613295126128, "learning_rate": 3.5809486475891806e-05, "loss": 0.4465, "step": 1009 }, { "epoch": 1.0687830687830688, "grad_norm": 0.2257507725040729, "learning_rate": 3.5789886319090555e-05, "loss": 0.4698, "step": 1010 }, { "epoch": 1.0698412698412698, "grad_norm": 0.24453737754436786, "learning_rate": 3.5770286162289304e-05, "loss": 0.4469, "step": 1011 }, { "epoch": 1.0708994708994708, "grad_norm": 0.24956877063093863, "learning_rate": 3.5750686005488047e-05, "loss": 0.4626, "step": 1012 }, { "epoch": 1.071957671957672, "grad_norm": 0.2600831587111751, "learning_rate": 3.573108584868679e-05, "loss": 0.4537, "step": 1013 }, { "epoch": 1.073015873015873, "grad_norm": 4.587346913530301, "learning_rate": 3.571148569188554e-05, "loss": 0.558, "step": 1014 }, { "epoch": 1.074074074074074, "grad_norm": 0.29359742422528, "learning_rate": 3.569188553508429e-05, "loss": 0.4682, "step": 1015 }, { "epoch": 1.0751322751322752, "grad_norm": 0.31833732588739005, "learning_rate": 3.567228537828303e-05, "loss": 0.4743, "step": 1016 }, { "epoch": 1.0761904761904761, "grad_norm": 0.2553867628775377, "learning_rate": 3.565268522148177e-05, "loss": 0.551, "step": 1017 }, { "epoch": 1.0772486772486773, "grad_norm": 0.24879754575790247, "learning_rate": 3.563308506468052e-05, "loss": 0.4524, "step": 1018 }, { "epoch": 1.0783068783068783, "grad_norm": 0.28766655345957765, "learning_rate": 3.5613484907879264e-05, "loss": 0.4384, "step": 1019 }, { "epoch": 1.0793650793650793, "grad_norm": 0.41507355236002946, "learning_rate": 3.5593884751078014e-05, "loss": 0.4206, "step": 1020 }, { "epoch": 1.0804232804232805, "grad_norm": 0.23507148230554162, "learning_rate": 3.5574284594276756e-05, "loss": 0.4353, "step": 1021 }, { "epoch": 1.0814814814814815, "grad_norm": 0.2622046524529905, "learning_rate": 3.55546844374755e-05, "loss": 0.487, "step": 1022 }, { "epoch": 1.0825396825396825, "grad_norm": 0.2963869501406697, "learning_rate": 3.553508428067425e-05, "loss": 0.5234, "step": 1023 }, { "epoch": 1.0835978835978837, "grad_norm": 0.22883220209770097, "learning_rate": 3.5515484123873e-05, "loss": 0.4493, "step": 1024 }, { "epoch": 1.0846560846560847, "grad_norm": 0.2569611944727975, "learning_rate": 3.549588396707173e-05, "loss": 0.466, "step": 1025 }, { "epoch": 1.0857142857142856, "grad_norm": 0.254023601263868, "learning_rate": 3.547628381027048e-05, "loss": 0.459, "step": 1026 }, { "epoch": 1.0867724867724868, "grad_norm": 0.23407024600575801, "learning_rate": 3.545668365346923e-05, "loss": 0.45, "step": 1027 }, { "epoch": 1.0878306878306878, "grad_norm": 0.2599527517255244, "learning_rate": 3.543708349666798e-05, "loss": 0.4115, "step": 1028 }, { "epoch": 1.0888888888888888, "grad_norm": 0.22311611567918718, "learning_rate": 3.5417483339866716e-05, "loss": 0.3979, "step": 1029 }, { "epoch": 1.08994708994709, "grad_norm": 0.2299712856828212, "learning_rate": 3.5397883183065466e-05, "loss": 0.3694, "step": 1030 }, { "epoch": 1.091005291005291, "grad_norm": 0.21532417571199142, "learning_rate": 3.5378283026264215e-05, "loss": 0.3551, "step": 1031 }, { "epoch": 1.0920634920634922, "grad_norm": 0.23843932815464314, "learning_rate": 3.535868286946296e-05, "loss": 0.4513, "step": 1032 }, { "epoch": 1.0931216931216932, "grad_norm": 0.21531850573097505, "learning_rate": 3.53390827126617e-05, "loss": 0.3931, "step": 1033 }, { "epoch": 1.0941798941798941, "grad_norm": 0.3068198759456981, "learning_rate": 3.531948255586045e-05, "loss": 0.4891, "step": 1034 }, { "epoch": 1.0952380952380953, "grad_norm": 0.23836601091918505, "learning_rate": 3.529988239905919e-05, "loss": 0.439, "step": 1035 }, { "epoch": 1.0962962962962963, "grad_norm": 0.2437498779338309, "learning_rate": 3.528028224225794e-05, "loss": 0.4149, "step": 1036 }, { "epoch": 1.0973544973544973, "grad_norm": 0.27351894671972754, "learning_rate": 3.526068208545668e-05, "loss": 0.4416, "step": 1037 }, { "epoch": 1.0984126984126985, "grad_norm": 0.27770230078648367, "learning_rate": 3.5241081928655426e-05, "loss": 0.4884, "step": 1038 }, { "epoch": 1.0994708994708995, "grad_norm": 0.25718315589569163, "learning_rate": 3.5221481771854175e-05, "loss": 0.4578, "step": 1039 }, { "epoch": 1.1005291005291005, "grad_norm": 0.2594326299087, "learning_rate": 3.5201881615052924e-05, "loss": 0.4864, "step": 1040 }, { "epoch": 1.1015873015873017, "grad_norm": 0.2472630187462841, "learning_rate": 3.5182281458251674e-05, "loss": 0.4451, "step": 1041 }, { "epoch": 1.1026455026455027, "grad_norm": 0.25781560522935376, "learning_rate": 3.516268130145041e-05, "loss": 0.4182, "step": 1042 }, { "epoch": 1.1037037037037036, "grad_norm": 0.2535547232578919, "learning_rate": 3.514308114464916e-05, "loss": 0.4749, "step": 1043 }, { "epoch": 1.1047619047619048, "grad_norm": 0.254021416442326, "learning_rate": 3.512348098784791e-05, "loss": 0.471, "step": 1044 }, { "epoch": 1.1058201058201058, "grad_norm": 0.28102597311344235, "learning_rate": 3.510388083104665e-05, "loss": 0.4615, "step": 1045 }, { "epoch": 1.1068783068783068, "grad_norm": 0.23113213821636622, "learning_rate": 3.508428067424539e-05, "loss": 0.4147, "step": 1046 }, { "epoch": 1.107936507936508, "grad_norm": 0.2474923745981431, "learning_rate": 3.506468051744414e-05, "loss": 0.4437, "step": 1047 }, { "epoch": 1.108994708994709, "grad_norm": 0.24830934850829475, "learning_rate": 3.5045080360642884e-05, "loss": 0.4376, "step": 1048 }, { "epoch": 1.11005291005291, "grad_norm": 0.47806726597633753, "learning_rate": 3.5025480203841634e-05, "loss": 0.4324, "step": 1049 }, { "epoch": 1.1111111111111112, "grad_norm": 0.32025751473392294, "learning_rate": 3.5005880047040376e-05, "loss": 0.5246, "step": 1050 }, { "epoch": 1.1121693121693121, "grad_norm": 0.28097198860260764, "learning_rate": 3.498627989023912e-05, "loss": 0.4598, "step": 1051 }, { "epoch": 1.1132275132275131, "grad_norm": 0.254947622562914, "learning_rate": 3.496667973343787e-05, "loss": 0.4222, "step": 1052 }, { "epoch": 1.1142857142857143, "grad_norm": 0.3349593604089576, "learning_rate": 3.494707957663662e-05, "loss": 0.4155, "step": 1053 }, { "epoch": 1.1153439153439153, "grad_norm": 0.26874038889591473, "learning_rate": 3.492747941983536e-05, "loss": 0.4375, "step": 1054 }, { "epoch": 1.1164021164021163, "grad_norm": 0.26633236590448656, "learning_rate": 3.49078792630341e-05, "loss": 0.4267, "step": 1055 }, { "epoch": 1.1174603174603175, "grad_norm": 0.2884717153428804, "learning_rate": 3.488827910623285e-05, "loss": 0.4257, "step": 1056 }, { "epoch": 1.1185185185185185, "grad_norm": 0.4452379521735504, "learning_rate": 3.48686789494316e-05, "loss": 0.4332, "step": 1057 }, { "epoch": 1.1195767195767197, "grad_norm": 0.7702289596361585, "learning_rate": 3.484907879263034e-05, "loss": 0.4349, "step": 1058 }, { "epoch": 1.1206349206349207, "grad_norm": 0.3656397372057208, "learning_rate": 3.4829478635829086e-05, "loss": 0.4238, "step": 1059 }, { "epoch": 1.1216931216931216, "grad_norm": 0.4866179217310575, "learning_rate": 3.4809878479027835e-05, "loss": 0.4216, "step": 1060 }, { "epoch": 1.1227513227513228, "grad_norm": 0.33474959074863353, "learning_rate": 3.479027832222658e-05, "loss": 0.4418, "step": 1061 }, { "epoch": 1.1238095238095238, "grad_norm": 0.27607814779984247, "learning_rate": 3.477067816542533e-05, "loss": 0.4364, "step": 1062 }, { "epoch": 1.1248677248677248, "grad_norm": 0.3407943035819788, "learning_rate": 3.475107800862407e-05, "loss": 0.4665, "step": 1063 }, { "epoch": 1.125925925925926, "grad_norm": 0.33884649086749913, "learning_rate": 3.473147785182282e-05, "loss": 0.4543, "step": 1064 }, { "epoch": 1.126984126984127, "grad_norm": 0.29045801993635223, "learning_rate": 3.471187769502156e-05, "loss": 0.4612, "step": 1065 }, { "epoch": 1.128042328042328, "grad_norm": 0.3424125515301145, "learning_rate": 3.469227753822031e-05, "loss": 0.5199, "step": 1066 }, { "epoch": 1.1291005291005292, "grad_norm": 0.29631324600571024, "learning_rate": 3.467267738141905e-05, "loss": 0.4505, "step": 1067 }, { "epoch": 1.1301587301587301, "grad_norm": 0.2254979652739058, "learning_rate": 3.4653077224617795e-05, "loss": 0.4902, "step": 1068 }, { "epoch": 1.1312169312169311, "grad_norm": 0.29453491815047816, "learning_rate": 3.4633477067816544e-05, "loss": 0.4537, "step": 1069 }, { "epoch": 1.1322751322751323, "grad_norm": 0.24120272460563255, "learning_rate": 3.4613876911015294e-05, "loss": 0.4156, "step": 1070 }, { "epoch": 1.1333333333333333, "grad_norm": 0.2417667303754181, "learning_rate": 3.4594276754214036e-05, "loss": 0.464, "step": 1071 }, { "epoch": 1.1343915343915345, "grad_norm": 0.24479296740003978, "learning_rate": 3.457467659741278e-05, "loss": 0.425, "step": 1072 }, { "epoch": 1.1354497354497355, "grad_norm": 0.2606469824823141, "learning_rate": 3.455507644061153e-05, "loss": 0.4171, "step": 1073 }, { "epoch": 1.1365079365079365, "grad_norm": 1.160342854664124, "learning_rate": 3.453547628381027e-05, "loss": 0.4907, "step": 1074 }, { "epoch": 1.1375661375661377, "grad_norm": 0.27475204211746457, "learning_rate": 3.451587612700902e-05, "loss": 0.4412, "step": 1075 }, { "epoch": 1.1386243386243386, "grad_norm": 0.27882185123707176, "learning_rate": 3.449627597020776e-05, "loss": 0.4304, "step": 1076 }, { "epoch": 1.1396825396825396, "grad_norm": 0.24769703151282585, "learning_rate": 3.447667581340651e-05, "loss": 0.4258, "step": 1077 }, { "epoch": 1.1407407407407408, "grad_norm": 0.2624061999879434, "learning_rate": 3.4457075656605254e-05, "loss": 0.4202, "step": 1078 }, { "epoch": 1.1417989417989418, "grad_norm": 0.23516866564878172, "learning_rate": 3.4437475499804e-05, "loss": 0.4201, "step": 1079 }, { "epoch": 1.1428571428571428, "grad_norm": 0.2591076428810598, "learning_rate": 3.4417875343002746e-05, "loss": 0.4179, "step": 1080 }, { "epoch": 1.143915343915344, "grad_norm": 0.245392393657956, "learning_rate": 3.439827518620149e-05, "loss": 0.3815, "step": 1081 }, { "epoch": 1.144973544973545, "grad_norm": 0.23935170354064206, "learning_rate": 3.437867502940024e-05, "loss": 0.4652, "step": 1082 }, { "epoch": 1.146031746031746, "grad_norm": 0.24157446804740304, "learning_rate": 3.435907487259899e-05, "loss": 0.4123, "step": 1083 }, { "epoch": 1.1470899470899472, "grad_norm": 0.2997924271290018, "learning_rate": 3.433947471579773e-05, "loss": 0.4711, "step": 1084 }, { "epoch": 1.1481481481481481, "grad_norm": 0.2513216731368017, "learning_rate": 3.431987455899647e-05, "loss": 0.4065, "step": 1085 }, { "epoch": 1.1492063492063491, "grad_norm": 0.2522352354653566, "learning_rate": 3.430027440219522e-05, "loss": 0.4234, "step": 1086 }, { "epoch": 1.1502645502645503, "grad_norm": 0.29099731437781645, "learning_rate": 3.4280674245393963e-05, "loss": 0.4617, "step": 1087 }, { "epoch": 1.1513227513227513, "grad_norm": 0.2940806275764955, "learning_rate": 3.426107408859271e-05, "loss": 0.4231, "step": 1088 }, { "epoch": 1.1523809523809523, "grad_norm": 0.27282227795834785, "learning_rate": 3.4241473931791455e-05, "loss": 0.4884, "step": 1089 }, { "epoch": 1.1534391534391535, "grad_norm": 0.25357226775494324, "learning_rate": 3.4221873774990204e-05, "loss": 0.4817, "step": 1090 }, { "epoch": 1.1544973544973545, "grad_norm": 0.29654179010898835, "learning_rate": 3.420227361818895e-05, "loss": 0.4057, "step": 1091 }, { "epoch": 1.1555555555555554, "grad_norm": 0.27272067824064034, "learning_rate": 3.4182673461387696e-05, "loss": 0.4649, "step": 1092 }, { "epoch": 1.1566137566137566, "grad_norm": 0.25231760876479825, "learning_rate": 3.416307330458644e-05, "loss": 0.4467, "step": 1093 }, { "epoch": 1.1576719576719576, "grad_norm": 1.4439868836946126, "learning_rate": 3.414347314778518e-05, "loss": 0.5031, "step": 1094 }, { "epoch": 1.1587301587301586, "grad_norm": 0.2669246549318654, "learning_rate": 3.412387299098393e-05, "loss": 0.4242, "step": 1095 }, { "epoch": 1.1597883597883598, "grad_norm": 0.4725447349327677, "learning_rate": 3.410427283418268e-05, "loss": 0.4037, "step": 1096 }, { "epoch": 1.1608465608465608, "grad_norm": 0.24176894320740916, "learning_rate": 3.4084672677381415e-05, "loss": 0.4461, "step": 1097 }, { "epoch": 1.161904761904762, "grad_norm": 0.2571630439924496, "learning_rate": 3.4065072520580165e-05, "loss": 0.4435, "step": 1098 }, { "epoch": 1.162962962962963, "grad_norm": 0.25405311821960447, "learning_rate": 3.4045472363778914e-05, "loss": 0.4008, "step": 1099 }, { "epoch": 1.164021164021164, "grad_norm": 0.2360510636639726, "learning_rate": 3.4025872206977656e-05, "loss": 0.4418, "step": 1100 }, { "epoch": 1.1650793650793652, "grad_norm": 0.24349573097824267, "learning_rate": 3.40062720501764e-05, "loss": 0.4242, "step": 1101 }, { "epoch": 1.1661375661375661, "grad_norm": 0.22205185004409997, "learning_rate": 3.398667189337515e-05, "loss": 0.4319, "step": 1102 }, { "epoch": 1.1671957671957671, "grad_norm": 0.25476677264568914, "learning_rate": 3.39670717365739e-05, "loss": 0.4214, "step": 1103 }, { "epoch": 1.1682539682539683, "grad_norm": 0.24140105278129537, "learning_rate": 3.394747157977264e-05, "loss": 0.4331, "step": 1104 }, { "epoch": 1.1693121693121693, "grad_norm": 0.23526559056529114, "learning_rate": 3.392787142297138e-05, "loss": 0.4854, "step": 1105 }, { "epoch": 1.1703703703703703, "grad_norm": 3.2639568571243096, "learning_rate": 3.390827126617013e-05, "loss": 0.444, "step": 1106 }, { "epoch": 1.1714285714285715, "grad_norm": 0.29307877776027735, "learning_rate": 3.3888671109368874e-05, "loss": 0.491, "step": 1107 }, { "epoch": 1.1724867724867725, "grad_norm": 0.25173250808327663, "learning_rate": 3.3869070952567623e-05, "loss": 0.4304, "step": 1108 }, { "epoch": 1.1735449735449737, "grad_norm": 0.23834729259967638, "learning_rate": 3.3849470795766366e-05, "loss": 0.4242, "step": 1109 }, { "epoch": 1.1746031746031746, "grad_norm": 0.2820981143799431, "learning_rate": 3.382987063896511e-05, "loss": 0.4531, "step": 1110 }, { "epoch": 1.1756613756613756, "grad_norm": 0.2684834031217499, "learning_rate": 3.381027048216386e-05, "loss": 0.4423, "step": 1111 }, { "epoch": 1.1767195767195768, "grad_norm": 0.22060471509608026, "learning_rate": 3.379067032536261e-05, "loss": 0.4567, "step": 1112 }, { "epoch": 1.1777777777777778, "grad_norm": 0.2938768585311419, "learning_rate": 3.3771070168561356e-05, "loss": 0.5653, "step": 1113 }, { "epoch": 1.1788359788359788, "grad_norm": 0.2185303188886883, "learning_rate": 3.375147001176009e-05, "loss": 0.4376, "step": 1114 }, { "epoch": 1.17989417989418, "grad_norm": 0.31115860640653836, "learning_rate": 3.373186985495884e-05, "loss": 0.477, "step": 1115 }, { "epoch": 1.180952380952381, "grad_norm": 0.2546303170293118, "learning_rate": 3.371226969815759e-05, "loss": 0.4421, "step": 1116 }, { "epoch": 1.182010582010582, "grad_norm": 0.24401462771526597, "learning_rate": 3.369266954135633e-05, "loss": 0.4158, "step": 1117 }, { "epoch": 1.1830687830687832, "grad_norm": 0.2371965772098529, "learning_rate": 3.3673069384555075e-05, "loss": 0.4148, "step": 1118 }, { "epoch": 1.1841269841269841, "grad_norm": 0.25311128441059516, "learning_rate": 3.3653469227753825e-05, "loss": 0.3996, "step": 1119 }, { "epoch": 1.1851851851851851, "grad_norm": 0.25406846795302634, "learning_rate": 3.363386907095257e-05, "loss": 0.4802, "step": 1120 }, { "epoch": 1.1862433862433863, "grad_norm": 0.28969576250269813, "learning_rate": 3.3614268914151316e-05, "loss": 0.4434, "step": 1121 }, { "epoch": 1.1873015873015873, "grad_norm": 0.25113888422786323, "learning_rate": 3.359466875735006e-05, "loss": 0.4693, "step": 1122 }, { "epoch": 1.1883597883597883, "grad_norm": 0.26242305511278513, "learning_rate": 3.35750686005488e-05, "loss": 0.4408, "step": 1123 }, { "epoch": 1.1894179894179895, "grad_norm": 0.24851482302114736, "learning_rate": 3.355546844374755e-05, "loss": 0.4855, "step": 1124 }, { "epoch": 1.1904761904761905, "grad_norm": 0.21264747963193062, "learning_rate": 3.35358682869463e-05, "loss": 0.403, "step": 1125 }, { "epoch": 1.1915343915343914, "grad_norm": 0.22264008089350337, "learning_rate": 3.351626813014504e-05, "loss": 0.4466, "step": 1126 }, { "epoch": 1.1925925925925926, "grad_norm": 0.27737325502984783, "learning_rate": 3.3496667973343785e-05, "loss": 0.4601, "step": 1127 }, { "epoch": 1.1936507936507936, "grad_norm": 3.368075981212778, "learning_rate": 3.3477067816542534e-05, "loss": 0.4871, "step": 1128 }, { "epoch": 1.1947089947089946, "grad_norm": 0.3148955129252137, "learning_rate": 3.345746765974128e-05, "loss": 0.4603, "step": 1129 }, { "epoch": 1.1957671957671958, "grad_norm": 0.2533234823076518, "learning_rate": 3.3437867502940026e-05, "loss": 0.4263, "step": 1130 }, { "epoch": 1.1968253968253968, "grad_norm": 0.25463206395686244, "learning_rate": 3.341826734613877e-05, "loss": 0.452, "step": 1131 }, { "epoch": 1.1978835978835978, "grad_norm": 0.27879057651843764, "learning_rate": 3.339866718933752e-05, "loss": 0.4882, "step": 1132 }, { "epoch": 1.198941798941799, "grad_norm": 0.26539022543940605, "learning_rate": 3.337906703253626e-05, "loss": 0.4372, "step": 1133 }, { "epoch": 1.2, "grad_norm": 0.279151992029459, "learning_rate": 3.335946687573501e-05, "loss": 0.4505, "step": 1134 }, { "epoch": 1.201058201058201, "grad_norm": 0.3036270580949587, "learning_rate": 3.333986671893375e-05, "loss": 0.4974, "step": 1135 }, { "epoch": 1.2021164021164021, "grad_norm": 0.27528174866789473, "learning_rate": 3.3320266562132494e-05, "loss": 0.4994, "step": 1136 }, { "epoch": 1.2031746031746031, "grad_norm": 0.29856950983454983, "learning_rate": 3.3300666405331244e-05, "loss": 0.4534, "step": 1137 }, { "epoch": 1.2042328042328043, "grad_norm": 0.27683008135091847, "learning_rate": 3.328106624852999e-05, "loss": 0.4379, "step": 1138 }, { "epoch": 1.2052910052910053, "grad_norm": 0.26301228869162596, "learning_rate": 3.3261466091728735e-05, "loss": 0.4369, "step": 1139 }, { "epoch": 1.2063492063492063, "grad_norm": 0.4106472014462593, "learning_rate": 3.324186593492748e-05, "loss": 0.4226, "step": 1140 }, { "epoch": 1.2074074074074075, "grad_norm": 0.2843670121313921, "learning_rate": 3.322226577812623e-05, "loss": 0.4895, "step": 1141 }, { "epoch": 1.2084656084656085, "grad_norm": 0.31274125765332483, "learning_rate": 3.3202665621324976e-05, "loss": 0.5111, "step": 1142 }, { "epoch": 1.2095238095238094, "grad_norm": 0.34931031172201393, "learning_rate": 3.318306546452372e-05, "loss": 0.4436, "step": 1143 }, { "epoch": 1.2105820105820106, "grad_norm": 0.23718925178525013, "learning_rate": 3.316346530772246e-05, "loss": 0.3974, "step": 1144 }, { "epoch": 1.2116402116402116, "grad_norm": 0.2668299284915015, "learning_rate": 3.314386515092121e-05, "loss": 0.4508, "step": 1145 }, { "epoch": 1.2126984126984126, "grad_norm": 0.2801312984615186, "learning_rate": 3.312426499411995e-05, "loss": 0.4694, "step": 1146 }, { "epoch": 1.2137566137566138, "grad_norm": 0.24312099943011134, "learning_rate": 3.31046648373187e-05, "loss": 0.4546, "step": 1147 }, { "epoch": 1.2148148148148148, "grad_norm": 0.25424837283402324, "learning_rate": 3.3085064680517445e-05, "loss": 0.4747, "step": 1148 }, { "epoch": 1.215873015873016, "grad_norm": 0.2674867566361891, "learning_rate": 3.306546452371619e-05, "loss": 0.4574, "step": 1149 }, { "epoch": 1.216931216931217, "grad_norm": 0.2627651556600422, "learning_rate": 3.3045864366914937e-05, "loss": 0.4378, "step": 1150 }, { "epoch": 1.217989417989418, "grad_norm": 0.23716263801602822, "learning_rate": 3.3026264210113686e-05, "loss": 0.4161, "step": 1151 }, { "epoch": 1.2190476190476192, "grad_norm": 0.3138567642984657, "learning_rate": 3.300666405331243e-05, "loss": 0.4383, "step": 1152 }, { "epoch": 1.2201058201058201, "grad_norm": 0.276967866343535, "learning_rate": 3.298706389651117e-05, "loss": 0.4737, "step": 1153 }, { "epoch": 1.2211640211640211, "grad_norm": 0.2607801018967495, "learning_rate": 3.296746373970992e-05, "loss": 0.4962, "step": 1154 }, { "epoch": 1.2222222222222223, "grad_norm": 0.24637415714449645, "learning_rate": 3.294786358290867e-05, "loss": 0.4259, "step": 1155 }, { "epoch": 1.2232804232804233, "grad_norm": 0.25635788195924125, "learning_rate": 3.292826342610741e-05, "loss": 0.4851, "step": 1156 }, { "epoch": 1.2243386243386243, "grad_norm": 21.708465964907735, "learning_rate": 3.2908663269306154e-05, "loss": 0.5324, "step": 1157 }, { "epoch": 1.2253968253968255, "grad_norm": 0.2480738276020431, "learning_rate": 3.2889063112504904e-05, "loss": 0.4119, "step": 1158 }, { "epoch": 1.2264550264550265, "grad_norm": 0.2781677513733888, "learning_rate": 3.2869462955703646e-05, "loss": 0.5124, "step": 1159 }, { "epoch": 1.2275132275132274, "grad_norm": 0.20926116289505092, "learning_rate": 3.2849862798902395e-05, "loss": 0.3981, "step": 1160 }, { "epoch": 1.2285714285714286, "grad_norm": 0.23499136854110111, "learning_rate": 3.283026264210114e-05, "loss": 0.4563, "step": 1161 }, { "epoch": 1.2296296296296296, "grad_norm": 0.28418800988255766, "learning_rate": 3.281066248529989e-05, "loss": 0.4683, "step": 1162 }, { "epoch": 1.2306878306878306, "grad_norm": 0.5732237050386909, "learning_rate": 3.279106232849863e-05, "loss": 0.454, "step": 1163 }, { "epoch": 1.2317460317460318, "grad_norm": 0.29349759306686635, "learning_rate": 3.277146217169738e-05, "loss": 0.3962, "step": 1164 }, { "epoch": 1.2328042328042328, "grad_norm": 0.3073242580524234, "learning_rate": 3.275186201489612e-05, "loss": 0.4743, "step": 1165 }, { "epoch": 1.2338624338624338, "grad_norm": 0.2695714862968926, "learning_rate": 3.2732261858094864e-05, "loss": 0.4684, "step": 1166 }, { "epoch": 1.234920634920635, "grad_norm": 0.24526233061405303, "learning_rate": 3.271266170129361e-05, "loss": 0.4098, "step": 1167 }, { "epoch": 1.235978835978836, "grad_norm": 0.282480189507544, "learning_rate": 3.269306154449236e-05, "loss": 0.4649, "step": 1168 }, { "epoch": 1.237037037037037, "grad_norm": 0.28035139011097865, "learning_rate": 3.26734613876911e-05, "loss": 0.5027, "step": 1169 }, { "epoch": 1.2380952380952381, "grad_norm": 0.2291595625165518, "learning_rate": 3.265386123088985e-05, "loss": 0.3896, "step": 1170 }, { "epoch": 1.2391534391534391, "grad_norm": 0.2566644949612776, "learning_rate": 3.2634261074088597e-05, "loss": 0.4192, "step": 1171 }, { "epoch": 1.24021164021164, "grad_norm": 0.24470481325359464, "learning_rate": 3.261466091728734e-05, "loss": 0.4963, "step": 1172 }, { "epoch": 1.2412698412698413, "grad_norm": 0.24563820953102872, "learning_rate": 3.259506076048608e-05, "loss": 0.4184, "step": 1173 }, { "epoch": 1.2423280423280423, "grad_norm": 0.2347354332412375, "learning_rate": 3.257546060368483e-05, "loss": 0.4514, "step": 1174 }, { "epoch": 1.2433862433862433, "grad_norm": 2.958020037172032, "learning_rate": 3.255586044688358e-05, "loss": 0.5267, "step": 1175 }, { "epoch": 1.2444444444444445, "grad_norm": 0.2843310735518099, "learning_rate": 3.253626029008232e-05, "loss": 0.4906, "step": 1176 }, { "epoch": 1.2455026455026454, "grad_norm": 0.2542336692718839, "learning_rate": 3.2516660133281065e-05, "loss": 0.4443, "step": 1177 }, { "epoch": 1.2465608465608466, "grad_norm": 0.22674151401535145, "learning_rate": 3.2497059976479814e-05, "loss": 0.4132, "step": 1178 }, { "epoch": 1.2476190476190476, "grad_norm": 0.24724783616086354, "learning_rate": 3.247745981967856e-05, "loss": 0.4808, "step": 1179 }, { "epoch": 1.2486772486772486, "grad_norm": 0.24184869409515652, "learning_rate": 3.2457859662877306e-05, "loss": 0.4494, "step": 1180 }, { "epoch": 1.2497354497354498, "grad_norm": 0.23220783704881004, "learning_rate": 3.243825950607605e-05, "loss": 0.4236, "step": 1181 }, { "epoch": 1.2507936507936508, "grad_norm": 0.21779940633439762, "learning_rate": 3.241865934927479e-05, "loss": 0.3891, "step": 1182 }, { "epoch": 1.2518518518518518, "grad_norm": 0.22367557629440174, "learning_rate": 3.239905919247354e-05, "loss": 0.4615, "step": 1183 }, { "epoch": 1.252910052910053, "grad_norm": 0.24819272296895625, "learning_rate": 3.237945903567229e-05, "loss": 0.4737, "step": 1184 }, { "epoch": 1.253968253968254, "grad_norm": 0.21940462041140096, "learning_rate": 3.235985887887103e-05, "loss": 0.4358, "step": 1185 }, { "epoch": 1.2550264550264552, "grad_norm": 0.23333157969435228, "learning_rate": 3.2340258722069775e-05, "loss": 0.4305, "step": 1186 }, { "epoch": 1.2560846560846561, "grad_norm": 0.2878829627468074, "learning_rate": 3.2320658565268524e-05, "loss": 0.429, "step": 1187 }, { "epoch": 1.2571428571428571, "grad_norm": 0.24749463547096354, "learning_rate": 3.230105840846727e-05, "loss": 0.4438, "step": 1188 }, { "epoch": 1.2582010582010583, "grad_norm": 0.228540686736987, "learning_rate": 3.2281458251666016e-05, "loss": 0.4468, "step": 1189 }, { "epoch": 1.2592592592592593, "grad_norm": 0.2531138794775126, "learning_rate": 3.226185809486476e-05, "loss": 0.4759, "step": 1190 }, { "epoch": 1.2603174603174603, "grad_norm": 0.24125935658343106, "learning_rate": 3.224225793806351e-05, "loss": 0.4406, "step": 1191 }, { "epoch": 1.2613756613756615, "grad_norm": 0.25935447680403384, "learning_rate": 3.222265778126225e-05, "loss": 0.4795, "step": 1192 }, { "epoch": 1.2624338624338625, "grad_norm": 0.23939043891539874, "learning_rate": 3.2203057624461e-05, "loss": 0.4322, "step": 1193 }, { "epoch": 1.2634920634920634, "grad_norm": 0.23783030731256977, "learning_rate": 3.218345746765974e-05, "loss": 0.4159, "step": 1194 }, { "epoch": 1.2645502645502646, "grad_norm": 0.2801793027048429, "learning_rate": 3.2163857310858484e-05, "loss": 0.5096, "step": 1195 }, { "epoch": 1.2656084656084656, "grad_norm": 0.25672962049215936, "learning_rate": 3.214425715405723e-05, "loss": 0.4475, "step": 1196 }, { "epoch": 1.2666666666666666, "grad_norm": 2.98727547896751, "learning_rate": 3.212465699725598e-05, "loss": 0.5605, "step": 1197 }, { "epoch": 1.2677248677248678, "grad_norm": 0.31486994948060426, "learning_rate": 3.2105056840454725e-05, "loss": 0.5239, "step": 1198 }, { "epoch": 1.2687830687830688, "grad_norm": 0.2651388574204949, "learning_rate": 3.208545668365347e-05, "loss": 0.4854, "step": 1199 }, { "epoch": 1.2698412698412698, "grad_norm": 0.23407941410701735, "learning_rate": 3.206585652685222e-05, "loss": 0.4403, "step": 1200 }, { "epoch": 1.270899470899471, "grad_norm": 0.25844549625625296, "learning_rate": 3.2046256370050966e-05, "loss": 0.4974, "step": 1201 }, { "epoch": 1.271957671957672, "grad_norm": 0.357313584767075, "learning_rate": 3.202665621324971e-05, "loss": 0.48, "step": 1202 }, { "epoch": 1.273015873015873, "grad_norm": 0.2498660944411242, "learning_rate": 3.200705605644845e-05, "loss": 0.4524, "step": 1203 }, { "epoch": 1.2740740740740741, "grad_norm": 0.2296750952608947, "learning_rate": 3.19874558996472e-05, "loss": 0.4473, "step": 1204 }, { "epoch": 1.2751322751322751, "grad_norm": 0.23977764063922036, "learning_rate": 3.196785574284594e-05, "loss": 0.4306, "step": 1205 }, { "epoch": 1.276190476190476, "grad_norm": 0.25984180898265835, "learning_rate": 3.194825558604469e-05, "loss": 0.4463, "step": 1206 }, { "epoch": 1.2772486772486773, "grad_norm": 0.2205818526717538, "learning_rate": 3.1928655429243434e-05, "loss": 0.4162, "step": 1207 }, { "epoch": 1.2783068783068783, "grad_norm": 0.20685381181973775, "learning_rate": 3.190905527244218e-05, "loss": 0.4067, "step": 1208 }, { "epoch": 1.2793650793650793, "grad_norm": 0.2622315576719718, "learning_rate": 3.1889455115640926e-05, "loss": 0.4535, "step": 1209 }, { "epoch": 1.2804232804232805, "grad_norm": 0.27562631291061185, "learning_rate": 3.1869854958839675e-05, "loss": 0.5411, "step": 1210 }, { "epoch": 1.2814814814814814, "grad_norm": 0.2436787808592762, "learning_rate": 3.185025480203842e-05, "loss": 0.4231, "step": 1211 }, { "epoch": 1.2825396825396824, "grad_norm": 0.26452961096327726, "learning_rate": 3.183065464523716e-05, "loss": 0.4682, "step": 1212 }, { "epoch": 1.2835978835978836, "grad_norm": 0.2561065853683004, "learning_rate": 3.181105448843591e-05, "loss": 0.4672, "step": 1213 }, { "epoch": 1.2846560846560846, "grad_norm": 0.23228316782055128, "learning_rate": 3.179145433163466e-05, "loss": 0.4774, "step": 1214 }, { "epoch": 1.2857142857142856, "grad_norm": 2.661626366284025, "learning_rate": 3.17718541748334e-05, "loss": 0.492, "step": 1215 }, { "epoch": 1.2867724867724868, "grad_norm": 0.2550139339573797, "learning_rate": 3.1752254018032144e-05, "loss": 0.393, "step": 1216 }, { "epoch": 1.2878306878306878, "grad_norm": 0.2982626987372917, "learning_rate": 3.173265386123089e-05, "loss": 0.4337, "step": 1217 }, { "epoch": 1.2888888888888888, "grad_norm": 0.28057911093701576, "learning_rate": 3.1713053704429636e-05, "loss": 0.4806, "step": 1218 }, { "epoch": 1.28994708994709, "grad_norm": 0.4561185589253618, "learning_rate": 3.1693453547628385e-05, "loss": 0.4365, "step": 1219 }, { "epoch": 1.291005291005291, "grad_norm": 0.29263696706005893, "learning_rate": 3.167385339082713e-05, "loss": 0.4656, "step": 1220 }, { "epoch": 1.2920634920634921, "grad_norm": 0.27093691099606854, "learning_rate": 3.165425323402587e-05, "loss": 0.4962, "step": 1221 }, { "epoch": 1.2931216931216931, "grad_norm": 0.28064868068280735, "learning_rate": 3.163465307722462e-05, "loss": 0.4068, "step": 1222 }, { "epoch": 1.294179894179894, "grad_norm": 0.3148596581856991, "learning_rate": 3.161505292042337e-05, "loss": 0.4815, "step": 1223 }, { "epoch": 1.2952380952380953, "grad_norm": 0.7434992980537427, "learning_rate": 3.159545276362211e-05, "loss": 0.5237, "step": 1224 }, { "epoch": 1.2962962962962963, "grad_norm": 0.27981974942403515, "learning_rate": 3.1575852606820853e-05, "loss": 0.4476, "step": 1225 }, { "epoch": 1.2973544973544975, "grad_norm": 0.30917000871173816, "learning_rate": 3.15562524500196e-05, "loss": 0.433, "step": 1226 }, { "epoch": 1.2984126984126985, "grad_norm": 0.2756768577076242, "learning_rate": 3.153665229321835e-05, "loss": 0.4463, "step": 1227 }, { "epoch": 1.2994708994708994, "grad_norm": 0.29298489584722304, "learning_rate": 3.1517052136417094e-05, "loss": 0.4787, "step": 1228 }, { "epoch": 1.3005291005291006, "grad_norm": 0.23814302832789452, "learning_rate": 3.149745197961584e-05, "loss": 0.4115, "step": 1229 }, { "epoch": 1.3015873015873016, "grad_norm": 0.3031186340550912, "learning_rate": 3.1477851822814586e-05, "loss": 0.4138, "step": 1230 }, { "epoch": 1.3026455026455026, "grad_norm": 0.28701409757098467, "learning_rate": 3.145825166601333e-05, "loss": 0.4445, "step": 1231 }, { "epoch": 1.3037037037037038, "grad_norm": 0.24322652680335896, "learning_rate": 3.143865150921208e-05, "loss": 0.4955, "step": 1232 }, { "epoch": 1.3047619047619048, "grad_norm": 0.2672959117899414, "learning_rate": 3.141905135241082e-05, "loss": 0.495, "step": 1233 }, { "epoch": 1.3058201058201058, "grad_norm": 0.25188935994394157, "learning_rate": 3.139945119560956e-05, "loss": 0.4609, "step": 1234 }, { "epoch": 1.306878306878307, "grad_norm": 0.24955613360702394, "learning_rate": 3.137985103880831e-05, "loss": 0.4466, "step": 1235 }, { "epoch": 1.307936507936508, "grad_norm": 0.23518682524284248, "learning_rate": 3.136025088200706e-05, "loss": 0.3943, "step": 1236 }, { "epoch": 1.308994708994709, "grad_norm": 0.23720969598849823, "learning_rate": 3.1340650725205804e-05, "loss": 0.5164, "step": 1237 }, { "epoch": 1.3100529100529101, "grad_norm": 0.21537376471764513, "learning_rate": 3.1321050568404546e-05, "loss": 0.4002, "step": 1238 }, { "epoch": 1.3111111111111111, "grad_norm": 1.6860157051017535, "learning_rate": 3.1301450411603296e-05, "loss": 0.4859, "step": 1239 }, { "epoch": 1.312169312169312, "grad_norm": 0.2651741235611922, "learning_rate": 3.1281850254802045e-05, "loss": 0.421, "step": 1240 }, { "epoch": 1.3132275132275133, "grad_norm": 0.23836910046795176, "learning_rate": 3.126225009800078e-05, "loss": 0.4484, "step": 1241 }, { "epoch": 1.3142857142857143, "grad_norm": 0.34511017850329395, "learning_rate": 3.124264994119953e-05, "loss": 0.4587, "step": 1242 }, { "epoch": 1.3153439153439153, "grad_norm": 0.3025522362899887, "learning_rate": 3.122304978439828e-05, "loss": 0.4275, "step": 1243 }, { "epoch": 1.3164021164021165, "grad_norm": 0.247152508905285, "learning_rate": 3.120344962759702e-05, "loss": 0.4214, "step": 1244 }, { "epoch": 1.3174603174603174, "grad_norm": 0.25398227298952686, "learning_rate": 3.1183849470795764e-05, "loss": 0.4581, "step": 1245 }, { "epoch": 1.3185185185185184, "grad_norm": 0.22633308872087934, "learning_rate": 3.1164249313994513e-05, "loss": 0.4584, "step": 1246 }, { "epoch": 1.3195767195767196, "grad_norm": 0.24210217710444076, "learning_rate": 3.1144649157193256e-05, "loss": 0.4206, "step": 1247 }, { "epoch": 1.3206349206349206, "grad_norm": 0.23335804933192775, "learning_rate": 3.1125049000392005e-05, "loss": 0.4248, "step": 1248 }, { "epoch": 1.3216931216931216, "grad_norm": 0.24164931346146493, "learning_rate": 3.110544884359075e-05, "loss": 0.4013, "step": 1249 }, { "epoch": 1.3227513227513228, "grad_norm": 1.2954500627805814, "learning_rate": 3.10858486867895e-05, "loss": 0.5315, "step": 1250 }, { "epoch": 1.3238095238095238, "grad_norm": 0.23445766500023593, "learning_rate": 3.106624852998824e-05, "loss": 0.4013, "step": 1251 }, { "epoch": 1.3248677248677247, "grad_norm": 0.24783454283131148, "learning_rate": 3.104664837318699e-05, "loss": 0.4648, "step": 1252 }, { "epoch": 1.325925925925926, "grad_norm": 0.2659873598285454, "learning_rate": 3.102704821638574e-05, "loss": 0.4992, "step": 1253 }, { "epoch": 1.326984126984127, "grad_norm": 0.6888495280810113, "learning_rate": 3.1007448059584474e-05, "loss": 0.4971, "step": 1254 }, { "epoch": 1.328042328042328, "grad_norm": 0.24722561577784258, "learning_rate": 3.098784790278322e-05, "loss": 0.4246, "step": 1255 }, { "epoch": 1.3291005291005291, "grad_norm": 0.27176108829651746, "learning_rate": 3.096824774598197e-05, "loss": 0.493, "step": 1256 }, { "epoch": 1.33015873015873, "grad_norm": 0.2922219870244475, "learning_rate": 3.0948647589180715e-05, "loss": 0.4643, "step": 1257 }, { "epoch": 1.3312169312169313, "grad_norm": 0.2495354668993547, "learning_rate": 3.092904743237946e-05, "loss": 0.4807, "step": 1258 }, { "epoch": 1.3322751322751323, "grad_norm": 0.25416175375930267, "learning_rate": 3.0909447275578206e-05, "loss": 0.4559, "step": 1259 }, { "epoch": 1.3333333333333333, "grad_norm": 0.2505309654320362, "learning_rate": 3.088984711877695e-05, "loss": 0.4175, "step": 1260 }, { "epoch": 1.3343915343915345, "grad_norm": 0.22168988059391945, "learning_rate": 3.08702469619757e-05, "loss": 0.3638, "step": 1261 }, { "epoch": 1.3354497354497354, "grad_norm": 0.2088949121467012, "learning_rate": 3.085064680517444e-05, "loss": 0.3575, "step": 1262 }, { "epoch": 1.3365079365079366, "grad_norm": 0.26660498435295343, "learning_rate": 3.083104664837319e-05, "loss": 0.4729, "step": 1263 }, { "epoch": 1.3375661375661376, "grad_norm": 0.24658883170058563, "learning_rate": 3.081144649157193e-05, "loss": 0.4545, "step": 1264 }, { "epoch": 1.3386243386243386, "grad_norm": 0.2236745034029776, "learning_rate": 3.079184633477068e-05, "loss": 0.4503, "step": 1265 }, { "epoch": 1.3396825396825398, "grad_norm": 0.24135305732779005, "learning_rate": 3.0772246177969424e-05, "loss": 0.4874, "step": 1266 }, { "epoch": 1.3407407407407408, "grad_norm": 0.3256043618189639, "learning_rate": 3.075264602116817e-05, "loss": 0.5231, "step": 1267 }, { "epoch": 1.3417989417989418, "grad_norm": 0.23737346236972562, "learning_rate": 3.0733045864366916e-05, "loss": 0.4316, "step": 1268 }, { "epoch": 1.342857142857143, "grad_norm": 0.23544854426751335, "learning_rate": 3.0713445707565665e-05, "loss": 0.5169, "step": 1269 }, { "epoch": 1.343915343915344, "grad_norm": 1.659064568731437, "learning_rate": 3.069384555076441e-05, "loss": 0.4348, "step": 1270 }, { "epoch": 1.344973544973545, "grad_norm": 0.2534937144399376, "learning_rate": 3.067424539396315e-05, "loss": 0.4652, "step": 1271 }, { "epoch": 1.3460317460317461, "grad_norm": 0.23526593221138056, "learning_rate": 3.06546452371619e-05, "loss": 0.4616, "step": 1272 }, { "epoch": 1.3470899470899471, "grad_norm": 0.23742609091689737, "learning_rate": 3.063504508036065e-05, "loss": 0.4369, "step": 1273 }, { "epoch": 1.348148148148148, "grad_norm": 0.2222447404589058, "learning_rate": 3.061544492355939e-05, "loss": 0.4124, "step": 1274 }, { "epoch": 1.3492063492063493, "grad_norm": 0.29184250980945264, "learning_rate": 3.0595844766758134e-05, "loss": 0.4454, "step": 1275 }, { "epoch": 1.3502645502645503, "grad_norm": 0.2349625400452902, "learning_rate": 3.057624460995688e-05, "loss": 0.4706, "step": 1276 }, { "epoch": 1.3513227513227513, "grad_norm": 0.21431852615283079, "learning_rate": 3.0556644453155625e-05, "loss": 0.4258, "step": 1277 }, { "epoch": 1.3523809523809525, "grad_norm": 0.21372092688801175, "learning_rate": 3.0537044296354375e-05, "loss": 0.4199, "step": 1278 }, { "epoch": 1.3534391534391534, "grad_norm": 0.2579456014466511, "learning_rate": 3.051744413955312e-05, "loss": 0.4978, "step": 1279 }, { "epoch": 1.3544973544973544, "grad_norm": 0.2507941348475173, "learning_rate": 3.0497843982751863e-05, "loss": 0.4665, "step": 1280 }, { "epoch": 1.3555555555555556, "grad_norm": 0.232129817590424, "learning_rate": 3.047824382595061e-05, "loss": 0.456, "step": 1281 }, { "epoch": 1.3566137566137566, "grad_norm": 0.2514526079081348, "learning_rate": 3.0458643669149355e-05, "loss": 0.4404, "step": 1282 }, { "epoch": 1.3576719576719576, "grad_norm": 0.2572184939136445, "learning_rate": 3.0439043512348097e-05, "loss": 0.5318, "step": 1283 }, { "epoch": 1.3587301587301588, "grad_norm": 0.4039307927131717, "learning_rate": 3.0419443355546846e-05, "loss": 0.3901, "step": 1284 }, { "epoch": 1.3597883597883598, "grad_norm": 0.24715025836977536, "learning_rate": 3.0399843198745592e-05, "loss": 0.4698, "step": 1285 }, { "epoch": 1.3608465608465607, "grad_norm": 0.23085610616983096, "learning_rate": 3.0380243041944338e-05, "loss": 0.4939, "step": 1286 }, { "epoch": 1.361904761904762, "grad_norm": 0.2364958065579408, "learning_rate": 3.036064288514308e-05, "loss": 0.4244, "step": 1287 }, { "epoch": 1.362962962962963, "grad_norm": 0.25662478506333875, "learning_rate": 3.034104272834183e-05, "loss": 0.4692, "step": 1288 }, { "epoch": 1.364021164021164, "grad_norm": 0.23895684567679154, "learning_rate": 3.0321442571540576e-05, "loss": 0.4296, "step": 1289 }, { "epoch": 1.3650793650793651, "grad_norm": 0.2440970628813272, "learning_rate": 3.030184241473932e-05, "loss": 0.4288, "step": 1290 }, { "epoch": 1.366137566137566, "grad_norm": 0.22375993243823888, "learning_rate": 3.0282242257938064e-05, "loss": 0.4409, "step": 1291 }, { "epoch": 1.367195767195767, "grad_norm": 0.2213878463198946, "learning_rate": 3.0262642101136813e-05, "loss": 0.4122, "step": 1292 }, { "epoch": 1.3682539682539683, "grad_norm": 0.23752017382355897, "learning_rate": 3.0243041944335553e-05, "loss": 0.4666, "step": 1293 }, { "epoch": 1.3693121693121693, "grad_norm": 0.25742210373427815, "learning_rate": 3.0223441787534302e-05, "loss": 0.4956, "step": 1294 }, { "epoch": 1.3703703703703702, "grad_norm": 0.234897224069936, "learning_rate": 3.0203841630733048e-05, "loss": 0.4467, "step": 1295 }, { "epoch": 1.3714285714285714, "grad_norm": 0.23263694871443225, "learning_rate": 3.018424147393179e-05, "loss": 0.4795, "step": 1296 }, { "epoch": 1.3724867724867724, "grad_norm": 0.2712687227877605, "learning_rate": 3.0164641317130536e-05, "loss": 0.5024, "step": 1297 }, { "epoch": 1.3735449735449736, "grad_norm": 0.22372235767515633, "learning_rate": 3.0145041160329285e-05, "loss": 0.48, "step": 1298 }, { "epoch": 1.3746031746031746, "grad_norm": 0.2406938475308407, "learning_rate": 3.012544100352803e-05, "loss": 0.4314, "step": 1299 }, { "epoch": 1.3756613756613756, "grad_norm": 0.21389054293693965, "learning_rate": 3.0105840846726774e-05, "loss": 0.4229, "step": 1300 }, { "epoch": 1.3767195767195768, "grad_norm": 0.23149026826112992, "learning_rate": 3.008624068992552e-05, "loss": 0.4707, "step": 1301 }, { "epoch": 1.3777777777777778, "grad_norm": 0.2691143078356963, "learning_rate": 3.006664053312427e-05, "loss": 0.4282, "step": 1302 }, { "epoch": 1.378835978835979, "grad_norm": 0.20755788867950342, "learning_rate": 3.004704037632301e-05, "loss": 0.3924, "step": 1303 }, { "epoch": 1.37989417989418, "grad_norm": 0.23052600055188638, "learning_rate": 3.0027440219521757e-05, "loss": 0.4129, "step": 1304 }, { "epoch": 1.380952380952381, "grad_norm": 0.22668631216324492, "learning_rate": 3.0007840062720503e-05, "loss": 0.4708, "step": 1305 }, { "epoch": 1.3820105820105821, "grad_norm": 0.2660877251492311, "learning_rate": 2.9988239905919246e-05, "loss": 0.5037, "step": 1306 }, { "epoch": 1.3830687830687831, "grad_norm": 0.22823512577697444, "learning_rate": 2.9968639749117995e-05, "loss": 0.4464, "step": 1307 }, { "epoch": 1.384126984126984, "grad_norm": 0.24777055372668572, "learning_rate": 2.994903959231674e-05, "loss": 0.5314, "step": 1308 }, { "epoch": 1.3851851851851853, "grad_norm": 0.23660719349812223, "learning_rate": 2.9929439435515483e-05, "loss": 0.4372, "step": 1309 }, { "epoch": 1.3862433862433863, "grad_norm": 0.25379316726262235, "learning_rate": 2.990983927871423e-05, "loss": 0.4353, "step": 1310 }, { "epoch": 1.3873015873015873, "grad_norm": 0.25004667495021365, "learning_rate": 2.989023912191298e-05, "loss": 0.4608, "step": 1311 }, { "epoch": 1.3883597883597885, "grad_norm": 0.22311146023995984, "learning_rate": 2.9870638965111724e-05, "loss": 0.4398, "step": 1312 }, { "epoch": 1.3894179894179894, "grad_norm": 0.2557529215124851, "learning_rate": 2.9851038808310467e-05, "loss": 0.5005, "step": 1313 }, { "epoch": 1.3904761904761904, "grad_norm": 0.219279598189162, "learning_rate": 2.9831438651509213e-05, "loss": 0.4129, "step": 1314 }, { "epoch": 1.3915343915343916, "grad_norm": 0.2269485021477995, "learning_rate": 2.9811838494707962e-05, "loss": 0.4145, "step": 1315 }, { "epoch": 1.3925925925925926, "grad_norm": 0.2331786311014036, "learning_rate": 2.9792238337906704e-05, "loss": 0.42, "step": 1316 }, { "epoch": 1.3936507936507936, "grad_norm": 0.31888841203624635, "learning_rate": 2.977263818110545e-05, "loss": 0.4383, "step": 1317 }, { "epoch": 1.3947089947089948, "grad_norm": 0.212841055960245, "learning_rate": 2.9753038024304196e-05, "loss": 0.4061, "step": 1318 }, { "epoch": 1.3957671957671958, "grad_norm": 0.24247705953739102, "learning_rate": 2.973343786750294e-05, "loss": 0.4658, "step": 1319 }, { "epoch": 1.3968253968253967, "grad_norm": 0.30703930058231754, "learning_rate": 2.9713837710701688e-05, "loss": 0.3601, "step": 1320 }, { "epoch": 1.397883597883598, "grad_norm": 0.23472988733850123, "learning_rate": 2.9694237553900434e-05, "loss": 0.4945, "step": 1321 }, { "epoch": 1.398941798941799, "grad_norm": 0.30870145935773974, "learning_rate": 2.967463739709918e-05, "loss": 0.4775, "step": 1322 }, { "epoch": 1.4, "grad_norm": 0.22196743152262746, "learning_rate": 2.9655037240297922e-05, "loss": 0.457, "step": 1323 }, { "epoch": 1.4010582010582011, "grad_norm": 0.23880933160360443, "learning_rate": 2.963543708349667e-05, "loss": 0.4267, "step": 1324 }, { "epoch": 1.402116402116402, "grad_norm": 0.20702662843283434, "learning_rate": 2.9615836926695417e-05, "loss": 0.441, "step": 1325 }, { "epoch": 1.403174603174603, "grad_norm": 0.2167350518937471, "learning_rate": 2.959623676989416e-05, "loss": 0.4408, "step": 1326 }, { "epoch": 1.4042328042328043, "grad_norm": 0.24089833902821953, "learning_rate": 2.9576636613092906e-05, "loss": 0.4968, "step": 1327 }, { "epoch": 1.4052910052910053, "grad_norm": 0.2241238478376523, "learning_rate": 2.9557036456291655e-05, "loss": 0.4517, "step": 1328 }, { "epoch": 1.4063492063492062, "grad_norm": 0.22856799607317055, "learning_rate": 2.9537436299490394e-05, "loss": 0.4579, "step": 1329 }, { "epoch": 1.4074074074074074, "grad_norm": 0.2268187851367138, "learning_rate": 2.9517836142689143e-05, "loss": 0.4649, "step": 1330 }, { "epoch": 1.4084656084656084, "grad_norm": 0.22233549147880394, "learning_rate": 2.949823598588789e-05, "loss": 0.4465, "step": 1331 }, { "epoch": 1.4095238095238094, "grad_norm": 0.2236855488492212, "learning_rate": 2.947863582908663e-05, "loss": 0.4228, "step": 1332 }, { "epoch": 1.4105820105820106, "grad_norm": 0.23801323711217828, "learning_rate": 2.9459035672285377e-05, "loss": 0.4252, "step": 1333 }, { "epoch": 1.4116402116402116, "grad_norm": 0.24798530897647444, "learning_rate": 2.9439435515484127e-05, "loss": 0.5064, "step": 1334 }, { "epoch": 1.4126984126984126, "grad_norm": 0.23941337695722817, "learning_rate": 2.9419835358682873e-05, "loss": 0.4679, "step": 1335 }, { "epoch": 1.4137566137566138, "grad_norm": 0.2623716098058726, "learning_rate": 2.9400235201881615e-05, "loss": 0.4473, "step": 1336 }, { "epoch": 1.4148148148148147, "grad_norm": 0.21199389424437273, "learning_rate": 2.938063504508036e-05, "loss": 0.4225, "step": 1337 }, { "epoch": 1.415873015873016, "grad_norm": 0.21541116818374437, "learning_rate": 2.936103488827911e-05, "loss": 0.4108, "step": 1338 }, { "epoch": 1.416931216931217, "grad_norm": 0.24853955951034445, "learning_rate": 2.9341434731477853e-05, "loss": 0.447, "step": 1339 }, { "epoch": 1.417989417989418, "grad_norm": 0.2215945031478815, "learning_rate": 2.93218345746766e-05, "loss": 0.4761, "step": 1340 }, { "epoch": 1.4190476190476191, "grad_norm": 0.2133339228199975, "learning_rate": 2.9302234417875344e-05, "loss": 0.4524, "step": 1341 }, { "epoch": 1.42010582010582, "grad_norm": 0.24231327015852916, "learning_rate": 2.9282634261074087e-05, "loss": 0.4629, "step": 1342 }, { "epoch": 1.4211640211640213, "grad_norm": 0.21685366161636507, "learning_rate": 2.9263034104272836e-05, "loss": 0.4234, "step": 1343 }, { "epoch": 1.4222222222222223, "grad_norm": 0.22253006527800145, "learning_rate": 2.9243433947471582e-05, "loss": 0.4323, "step": 1344 }, { "epoch": 1.4232804232804233, "grad_norm": 0.22176071929381708, "learning_rate": 2.9223833790670325e-05, "loss": 0.4427, "step": 1345 }, { "epoch": 1.4243386243386245, "grad_norm": 0.24232602640124465, "learning_rate": 2.920423363386907e-05, "loss": 0.3828, "step": 1346 }, { "epoch": 1.4253968253968254, "grad_norm": 0.22581122208104695, "learning_rate": 2.918463347706782e-05, "loss": 0.4805, "step": 1347 }, { "epoch": 1.4264550264550264, "grad_norm": 0.20026012333803486, "learning_rate": 2.9165033320266566e-05, "loss": 0.3919, "step": 1348 }, { "epoch": 1.4275132275132276, "grad_norm": 0.22333787876129188, "learning_rate": 2.9145433163465308e-05, "loss": 0.4131, "step": 1349 }, { "epoch": 1.4285714285714286, "grad_norm": 0.2687484229021416, "learning_rate": 2.9125833006664054e-05, "loss": 0.5264, "step": 1350 }, { "epoch": 1.4296296296296296, "grad_norm": 0.2103685914143013, "learning_rate": 2.9106232849862803e-05, "loss": 0.4576, "step": 1351 }, { "epoch": 1.4306878306878308, "grad_norm": 0.21771460457762667, "learning_rate": 2.9086632693061546e-05, "loss": 0.4361, "step": 1352 }, { "epoch": 1.4317460317460318, "grad_norm": 0.20303241796561644, "learning_rate": 2.906703253626029e-05, "loss": 0.3971, "step": 1353 }, { "epoch": 1.4328042328042327, "grad_norm": 0.20143577073807975, "learning_rate": 2.9047432379459037e-05, "loss": 0.3797, "step": 1354 }, { "epoch": 1.433862433862434, "grad_norm": 0.2329009387387878, "learning_rate": 2.902783222265778e-05, "loss": 0.506, "step": 1355 }, { "epoch": 1.434920634920635, "grad_norm": 0.2702723280262594, "learning_rate": 2.900823206585653e-05, "loss": 0.3775, "step": 1356 }, { "epoch": 1.435978835978836, "grad_norm": 0.2834676972961095, "learning_rate": 2.8988631909055275e-05, "loss": 0.462, "step": 1357 }, { "epoch": 1.4370370370370371, "grad_norm": 0.23065496781142314, "learning_rate": 2.8969031752254017e-05, "loss": 0.4994, "step": 1358 }, { "epoch": 1.438095238095238, "grad_norm": 0.22768658403107855, "learning_rate": 2.8949431595452763e-05, "loss": 0.4218, "step": 1359 }, { "epoch": 1.439153439153439, "grad_norm": 0.2680457798964453, "learning_rate": 2.8929831438651513e-05, "loss": 0.4527, "step": 1360 }, { "epoch": 1.4402116402116403, "grad_norm": 0.2275408510504903, "learning_rate": 2.891023128185026e-05, "loss": 0.4718, "step": 1361 }, { "epoch": 1.4412698412698413, "grad_norm": 0.23860010786360963, "learning_rate": 2.8890631125049e-05, "loss": 0.4656, "step": 1362 }, { "epoch": 1.4423280423280422, "grad_norm": 0.5143958846694693, "learning_rate": 2.8871030968247747e-05, "loss": 0.4567, "step": 1363 }, { "epoch": 1.4433862433862434, "grad_norm": 0.23216914200826835, "learning_rate": 2.8851430811446496e-05, "loss": 0.4873, "step": 1364 }, { "epoch": 1.4444444444444444, "grad_norm": 0.25213100827470797, "learning_rate": 2.8831830654645235e-05, "loss": 0.4681, "step": 1365 }, { "epoch": 1.4455026455026454, "grad_norm": 0.22446515285108198, "learning_rate": 2.8812230497843984e-05, "loss": 0.3967, "step": 1366 }, { "epoch": 1.4465608465608466, "grad_norm": 0.22388321159591337, "learning_rate": 2.879263034104273e-05, "loss": 0.409, "step": 1367 }, { "epoch": 1.4476190476190476, "grad_norm": 0.22054153685138833, "learning_rate": 2.8773030184241473e-05, "loss": 0.4393, "step": 1368 }, { "epoch": 1.4486772486772486, "grad_norm": 0.23961761676424165, "learning_rate": 2.875343002744022e-05, "loss": 0.416, "step": 1369 }, { "epoch": 1.4497354497354498, "grad_norm": 0.24967858263135762, "learning_rate": 2.8733829870638968e-05, "loss": 0.4897, "step": 1370 }, { "epoch": 1.4507936507936507, "grad_norm": 0.23131162096590446, "learning_rate": 2.8714229713837714e-05, "loss": 0.4876, "step": 1371 }, { "epoch": 1.4518518518518517, "grad_norm": 0.9779701550723804, "learning_rate": 2.8694629557036456e-05, "loss": 0.4304, "step": 1372 }, { "epoch": 1.452910052910053, "grad_norm": 0.25630422299181627, "learning_rate": 2.8675029400235202e-05, "loss": 0.4348, "step": 1373 }, { "epoch": 1.453968253968254, "grad_norm": 0.25784690293324475, "learning_rate": 2.865542924343395e-05, "loss": 0.4629, "step": 1374 }, { "epoch": 1.455026455026455, "grad_norm": 0.27761793043496447, "learning_rate": 2.8635829086632694e-05, "loss": 0.4485, "step": 1375 }, { "epoch": 1.456084656084656, "grad_norm": 0.26282171909723734, "learning_rate": 2.861622892983144e-05, "loss": 0.4981, "step": 1376 }, { "epoch": 1.457142857142857, "grad_norm": 0.7366784020256089, "learning_rate": 2.8596628773030186e-05, "loss": 0.4384, "step": 1377 }, { "epoch": 1.4582010582010583, "grad_norm": 0.23451220676925627, "learning_rate": 2.8577028616228928e-05, "loss": 0.4342, "step": 1378 }, { "epoch": 1.4592592592592593, "grad_norm": 0.2232768861934982, "learning_rate": 2.8557428459427677e-05, "loss": 0.4122, "step": 1379 }, { "epoch": 1.4603174603174602, "grad_norm": 0.2330751792602911, "learning_rate": 2.8537828302626423e-05, "loss": 0.4716, "step": 1380 }, { "epoch": 1.4613756613756614, "grad_norm": 0.2363839837673862, "learning_rate": 2.8518228145825166e-05, "loss": 0.4455, "step": 1381 }, { "epoch": 1.4624338624338624, "grad_norm": 0.22199158535950497, "learning_rate": 2.8498627989023912e-05, "loss": 0.4017, "step": 1382 }, { "epoch": 1.4634920634920636, "grad_norm": 0.2140913960403465, "learning_rate": 2.847902783222266e-05, "loss": 0.4171, "step": 1383 }, { "epoch": 1.4645502645502646, "grad_norm": 0.21942455553450194, "learning_rate": 2.8459427675421407e-05, "loss": 0.4345, "step": 1384 }, { "epoch": 1.4656084656084656, "grad_norm": 0.2589886529373693, "learning_rate": 2.843982751862015e-05, "loss": 0.4751, "step": 1385 }, { "epoch": 1.4666666666666668, "grad_norm": 0.2161806830799717, "learning_rate": 2.8420227361818895e-05, "loss": 0.449, "step": 1386 }, { "epoch": 1.4677248677248678, "grad_norm": 0.2155045931577355, "learning_rate": 2.8400627205017644e-05, "loss": 0.3958, "step": 1387 }, { "epoch": 1.4687830687830687, "grad_norm": 34.14419147235214, "learning_rate": 2.8381027048216387e-05, "loss": 0.931, "step": 1388 }, { "epoch": 1.46984126984127, "grad_norm": 0.2349647452936981, "learning_rate": 2.8361426891415133e-05, "loss": 0.4205, "step": 1389 }, { "epoch": 1.470899470899471, "grad_norm": 0.4986074723963525, "learning_rate": 2.834182673461388e-05, "loss": 0.4416, "step": 1390 }, { "epoch": 1.471957671957672, "grad_norm": 0.23543109220326247, "learning_rate": 2.832222657781262e-05, "loss": 0.4605, "step": 1391 }, { "epoch": 1.4730158730158731, "grad_norm": 0.2041343822972172, "learning_rate": 2.830262642101137e-05, "loss": 0.4152, "step": 1392 }, { "epoch": 1.474074074074074, "grad_norm": 0.22858488741114016, "learning_rate": 2.8283026264210116e-05, "loss": 0.4099, "step": 1393 }, { "epoch": 1.475132275132275, "grad_norm": 0.2583668529158323, "learning_rate": 2.826342610740886e-05, "loss": 0.4569, "step": 1394 }, { "epoch": 1.4761904761904763, "grad_norm": 0.2110916980288495, "learning_rate": 2.8243825950607605e-05, "loss": 0.4281, "step": 1395 }, { "epoch": 1.4772486772486773, "grad_norm": 0.2335971618684065, "learning_rate": 2.8224225793806354e-05, "loss": 0.4363, "step": 1396 }, { "epoch": 1.4783068783068782, "grad_norm": 0.24852243373430788, "learning_rate": 2.82046256370051e-05, "loss": 0.4723, "step": 1397 }, { "epoch": 1.4793650793650794, "grad_norm": 1.7834274229889742, "learning_rate": 2.8185025480203842e-05, "loss": 0.4434, "step": 1398 }, { "epoch": 1.4804232804232804, "grad_norm": 0.24286885099131655, "learning_rate": 2.8165425323402588e-05, "loss": 0.4564, "step": 1399 }, { "epoch": 1.4814814814814814, "grad_norm": 0.2609544012499301, "learning_rate": 2.8145825166601337e-05, "loss": 0.4881, "step": 1400 }, { "epoch": 1.4825396825396826, "grad_norm": 0.2078688963633995, "learning_rate": 2.8126225009800077e-05, "loss": 0.4035, "step": 1401 }, { "epoch": 1.4835978835978836, "grad_norm": 0.2183047037176754, "learning_rate": 2.8106624852998826e-05, "loss": 0.4606, "step": 1402 }, { "epoch": 1.4846560846560846, "grad_norm": 0.24125277886676127, "learning_rate": 2.808702469619757e-05, "loss": 0.3907, "step": 1403 }, { "epoch": 1.4857142857142858, "grad_norm": 0.21563203906268566, "learning_rate": 2.8067424539396314e-05, "loss": 0.3794, "step": 1404 }, { "epoch": 1.4867724867724867, "grad_norm": 0.23564668760181676, "learning_rate": 2.804782438259506e-05, "loss": 0.3883, "step": 1405 }, { "epoch": 1.4878306878306877, "grad_norm": 0.22700381452133556, "learning_rate": 2.802822422579381e-05, "loss": 0.4516, "step": 1406 }, { "epoch": 1.488888888888889, "grad_norm": 0.21734610674555102, "learning_rate": 2.8008624068992552e-05, "loss": 0.4437, "step": 1407 }, { "epoch": 1.48994708994709, "grad_norm": 0.23992886346228195, "learning_rate": 2.7989023912191298e-05, "loss": 0.4216, "step": 1408 }, { "epoch": 1.491005291005291, "grad_norm": 0.2628261185467104, "learning_rate": 2.7969423755390044e-05, "loss": 0.5153, "step": 1409 }, { "epoch": 1.492063492063492, "grad_norm": 0.2224824662272724, "learning_rate": 2.7949823598588793e-05, "loss": 0.4091, "step": 1410 }, { "epoch": 1.493121693121693, "grad_norm": 0.23493481404403857, "learning_rate": 2.7930223441787535e-05, "loss": 0.4866, "step": 1411 }, { "epoch": 1.494179894179894, "grad_norm": 0.2833535564343193, "learning_rate": 2.791062328498628e-05, "loss": 0.5036, "step": 1412 }, { "epoch": 1.4952380952380953, "grad_norm": 0.2436710459859125, "learning_rate": 2.7891023128185027e-05, "loss": 0.4242, "step": 1413 }, { "epoch": 1.4962962962962962, "grad_norm": 0.2137011724525208, "learning_rate": 2.787142297138377e-05, "loss": 0.4408, "step": 1414 }, { "epoch": 1.4973544973544972, "grad_norm": 0.21652484554883622, "learning_rate": 2.785182281458252e-05, "loss": 0.3977, "step": 1415 }, { "epoch": 1.4984126984126984, "grad_norm": 0.22029526293438814, "learning_rate": 2.7832222657781265e-05, "loss": 0.4157, "step": 1416 }, { "epoch": 1.4994708994708994, "grad_norm": 1.4501431445796187, "learning_rate": 2.7812622500980007e-05, "loss": 0.5586, "step": 1417 }, { "epoch": 1.5005291005291004, "grad_norm": 0.22081871178721485, "learning_rate": 2.7793022344178753e-05, "loss": 0.388, "step": 1418 }, { "epoch": 1.5015873015873016, "grad_norm": 0.23512336150560495, "learning_rate": 2.7773422187377502e-05, "loss": 0.5023, "step": 1419 }, { "epoch": 1.5026455026455028, "grad_norm": 0.20454204650582183, "learning_rate": 2.7753822030576248e-05, "loss": 0.4074, "step": 1420 }, { "epoch": 1.5037037037037035, "grad_norm": 0.23959488063488388, "learning_rate": 2.773422187377499e-05, "loss": 0.408, "step": 1421 }, { "epoch": 1.5047619047619047, "grad_norm": 0.21797011158561788, "learning_rate": 2.7714621716973737e-05, "loss": 0.4108, "step": 1422 }, { "epoch": 1.505820105820106, "grad_norm": 0.2199227438643587, "learning_rate": 2.7695021560172486e-05, "loss": 0.4537, "step": 1423 }, { "epoch": 1.506878306878307, "grad_norm": 0.2197835414815512, "learning_rate": 2.7675421403371228e-05, "loss": 0.4258, "step": 1424 }, { "epoch": 1.507936507936508, "grad_norm": 0.2527966111155505, "learning_rate": 2.7655821246569974e-05, "loss": 0.5248, "step": 1425 }, { "epoch": 1.508994708994709, "grad_norm": 0.2261829434017835, "learning_rate": 2.763622108976872e-05, "loss": 0.4856, "step": 1426 }, { "epoch": 1.51005291005291, "grad_norm": 0.22385869251344861, "learning_rate": 2.7616620932967463e-05, "loss": 0.4547, "step": 1427 }, { "epoch": 1.511111111111111, "grad_norm": 0.23763869903387455, "learning_rate": 2.7597020776166212e-05, "loss": 0.4457, "step": 1428 }, { "epoch": 1.5121693121693123, "grad_norm": 0.24360801662833237, "learning_rate": 2.7577420619364958e-05, "loss": 0.453, "step": 1429 }, { "epoch": 1.5132275132275133, "grad_norm": 0.21514915559651115, "learning_rate": 2.75578204625637e-05, "loss": 0.4227, "step": 1430 }, { "epoch": 1.5142857142857142, "grad_norm": 0.25074559433170956, "learning_rate": 2.7538220305762446e-05, "loss": 0.4814, "step": 1431 }, { "epoch": 1.5153439153439154, "grad_norm": 0.2130270827244537, "learning_rate": 2.7518620148961195e-05, "loss": 0.4009, "step": 1432 }, { "epoch": 1.5164021164021164, "grad_norm": 0.2152015038995373, "learning_rate": 2.749901999215994e-05, "loss": 0.3733, "step": 1433 }, { "epoch": 1.5174603174603174, "grad_norm": 0.24155192005071274, "learning_rate": 2.7479419835358684e-05, "loss": 0.3632, "step": 1434 }, { "epoch": 1.5185185185185186, "grad_norm": 0.2503955177301381, "learning_rate": 2.745981967855743e-05, "loss": 0.3957, "step": 1435 }, { "epoch": 1.5195767195767196, "grad_norm": 0.21447061332958312, "learning_rate": 2.744021952175618e-05, "loss": 0.4501, "step": 1436 }, { "epoch": 1.5206349206349206, "grad_norm": 0.2142050063037971, "learning_rate": 2.7420619364954918e-05, "loss": 0.4045, "step": 1437 }, { "epoch": 1.5216931216931218, "grad_norm": 0.23469888425670518, "learning_rate": 2.7401019208153667e-05, "loss": 0.4069, "step": 1438 }, { "epoch": 1.5227513227513227, "grad_norm": 0.2373104219367434, "learning_rate": 2.7381419051352413e-05, "loss": 0.4439, "step": 1439 }, { "epoch": 1.5238095238095237, "grad_norm": 0.22931990044472228, "learning_rate": 2.7361818894551155e-05, "loss": 0.4105, "step": 1440 }, { "epoch": 1.524867724867725, "grad_norm": 0.24468342878432897, "learning_rate": 2.73422187377499e-05, "loss": 0.4859, "step": 1441 }, { "epoch": 1.525925925925926, "grad_norm": 0.23888600508604607, "learning_rate": 2.732261858094865e-05, "loss": 0.4692, "step": 1442 }, { "epoch": 1.5269841269841269, "grad_norm": 0.2379674355949921, "learning_rate": 2.7303018424147393e-05, "loss": 0.4565, "step": 1443 }, { "epoch": 1.528042328042328, "grad_norm": 0.21540467607096592, "learning_rate": 2.728341826734614e-05, "loss": 0.4183, "step": 1444 }, { "epoch": 1.529100529100529, "grad_norm": 0.2185049084742189, "learning_rate": 2.7263818110544885e-05, "loss": 0.4343, "step": 1445 }, { "epoch": 1.53015873015873, "grad_norm": 0.3364663153817796, "learning_rate": 2.7244217953743634e-05, "loss": 0.4324, "step": 1446 }, { "epoch": 1.5312169312169313, "grad_norm": 0.2053796079743368, "learning_rate": 2.7224617796942377e-05, "loss": 0.3719, "step": 1447 }, { "epoch": 1.5322751322751322, "grad_norm": 0.22693217543362984, "learning_rate": 2.7205017640141122e-05, "loss": 0.4356, "step": 1448 }, { "epoch": 1.5333333333333332, "grad_norm": 0.27177436391000964, "learning_rate": 2.718541748333987e-05, "loss": 0.5063, "step": 1449 }, { "epoch": 1.5343915343915344, "grad_norm": 0.2272754512480649, "learning_rate": 2.716581732653861e-05, "loss": 0.4487, "step": 1450 }, { "epoch": 1.5354497354497354, "grad_norm": 0.2410315063196938, "learning_rate": 2.714621716973736e-05, "loss": 0.4438, "step": 1451 }, { "epoch": 1.5365079365079364, "grad_norm": 0.25742696962818656, "learning_rate": 2.7126617012936106e-05, "loss": 0.4149, "step": 1452 }, { "epoch": 1.5375661375661376, "grad_norm": 0.21908883154703712, "learning_rate": 2.710701685613485e-05, "loss": 0.4035, "step": 1453 }, { "epoch": 1.5386243386243388, "grad_norm": 0.2720610233232513, "learning_rate": 2.7087416699333594e-05, "loss": 0.4956, "step": 1454 }, { "epoch": 1.5396825396825395, "grad_norm": 0.25478178075039626, "learning_rate": 2.7067816542532344e-05, "loss": 0.4651, "step": 1455 }, { "epoch": 1.5407407407407407, "grad_norm": 0.24946076321329155, "learning_rate": 2.7048216385731083e-05, "loss": 0.4157, "step": 1456 }, { "epoch": 1.541798941798942, "grad_norm": 0.22111397690870904, "learning_rate": 2.7028616228929832e-05, "loss": 0.4587, "step": 1457 }, { "epoch": 1.5428571428571427, "grad_norm": 0.2324140499446646, "learning_rate": 2.7009016072128578e-05, "loss": 0.405, "step": 1458 }, { "epoch": 1.543915343915344, "grad_norm": 0.22732415750667306, "learning_rate": 2.6989415915327327e-05, "loss": 0.4055, "step": 1459 }, { "epoch": 1.544973544973545, "grad_norm": 0.24012853054594216, "learning_rate": 2.696981575852607e-05, "loss": 0.4123, "step": 1460 }, { "epoch": 1.5460317460317459, "grad_norm": 0.23424926428598633, "learning_rate": 2.6950215601724815e-05, "loss": 0.4752, "step": 1461 }, { "epoch": 1.547089947089947, "grad_norm": 0.21660322408001612, "learning_rate": 2.693061544492356e-05, "loss": 0.4178, "step": 1462 }, { "epoch": 1.5481481481481483, "grad_norm": 0.24724861696989267, "learning_rate": 2.6911015288122304e-05, "loss": 0.4814, "step": 1463 }, { "epoch": 1.5492063492063493, "grad_norm": 2.4202839601143395, "learning_rate": 2.6891415131321053e-05, "loss": 0.4914, "step": 1464 }, { "epoch": 1.5502645502645502, "grad_norm": 0.2730981950672889, "learning_rate": 2.68718149745198e-05, "loss": 0.4757, "step": 1465 }, { "epoch": 1.5513227513227514, "grad_norm": 0.25371740302385765, "learning_rate": 2.685221481771854e-05, "loss": 0.4303, "step": 1466 }, { "epoch": 1.5523809523809524, "grad_norm": 0.23514106327423817, "learning_rate": 2.6832614660917287e-05, "loss": 0.4854, "step": 1467 }, { "epoch": 1.5534391534391534, "grad_norm": 0.2708610524635199, "learning_rate": 2.6813014504116037e-05, "loss": 0.4422, "step": 1468 }, { "epoch": 1.5544973544973546, "grad_norm": 0.30542783481347546, "learning_rate": 2.6793414347314782e-05, "loss": 0.528, "step": 1469 }, { "epoch": 1.5555555555555556, "grad_norm": 0.2966857021941013, "learning_rate": 2.6773814190513525e-05, "loss": 0.5021, "step": 1470 }, { "epoch": 1.5566137566137566, "grad_norm": 0.2860444368837978, "learning_rate": 2.675421403371227e-05, "loss": 0.4251, "step": 1471 }, { "epoch": 1.5576719576719578, "grad_norm": 0.28665886400785867, "learning_rate": 2.673461387691102e-05, "loss": 0.4435, "step": 1472 }, { "epoch": 1.5587301587301587, "grad_norm": 0.2500775008406684, "learning_rate": 2.671501372010976e-05, "loss": 0.4658, "step": 1473 }, { "epoch": 1.5597883597883597, "grad_norm": 0.22253440194972182, "learning_rate": 2.669541356330851e-05, "loss": 0.4646, "step": 1474 }, { "epoch": 1.560846560846561, "grad_norm": 0.2146340288566786, "learning_rate": 2.6675813406507254e-05, "loss": 0.3825, "step": 1475 }, { "epoch": 1.561904761904762, "grad_norm": 0.25064823580585577, "learning_rate": 2.6656213249705997e-05, "loss": 0.4584, "step": 1476 }, { "epoch": 1.5629629629629629, "grad_norm": 0.2134463383511302, "learning_rate": 2.6636613092904743e-05, "loss": 0.3789, "step": 1477 }, { "epoch": 1.564021164021164, "grad_norm": 0.2151371289165454, "learning_rate": 2.6617012936103492e-05, "loss": 0.4082, "step": 1478 }, { "epoch": 1.565079365079365, "grad_norm": 0.211315815255196, "learning_rate": 2.6597412779302234e-05, "loss": 0.4444, "step": 1479 }, { "epoch": 1.566137566137566, "grad_norm": 0.2638512019499202, "learning_rate": 2.657781262250098e-05, "loss": 0.4507, "step": 1480 }, { "epoch": 1.5671957671957673, "grad_norm": 0.2249085194794952, "learning_rate": 2.6558212465699726e-05, "loss": 0.4262, "step": 1481 }, { "epoch": 1.5682539682539682, "grad_norm": 0.22540466561989844, "learning_rate": 2.6538612308898475e-05, "loss": 0.4295, "step": 1482 }, { "epoch": 1.5693121693121692, "grad_norm": 0.30665171807723735, "learning_rate": 2.6519012152097218e-05, "loss": 0.4453, "step": 1483 }, { "epoch": 1.5703703703703704, "grad_norm": 0.22641322915874576, "learning_rate": 2.6499411995295964e-05, "loss": 0.4591, "step": 1484 }, { "epoch": 1.5714285714285714, "grad_norm": 0.23344135403479024, "learning_rate": 2.647981183849471e-05, "loss": 0.4637, "step": 1485 }, { "epoch": 1.5724867724867724, "grad_norm": 0.22588214050919211, "learning_rate": 2.6460211681693452e-05, "loss": 0.4211, "step": 1486 }, { "epoch": 1.5735449735449736, "grad_norm": 0.21399920852193247, "learning_rate": 2.64406115248922e-05, "loss": 0.411, "step": 1487 }, { "epoch": 1.5746031746031746, "grad_norm": 0.23919553348000056, "learning_rate": 2.6421011368090947e-05, "loss": 0.4556, "step": 1488 }, { "epoch": 1.5756613756613755, "grad_norm": 0.2540160136398132, "learning_rate": 2.640141121128969e-05, "loss": 0.4276, "step": 1489 }, { "epoch": 1.5767195767195767, "grad_norm": 0.21268692149914978, "learning_rate": 2.6381811054488436e-05, "loss": 0.4493, "step": 1490 }, { "epoch": 1.5777777777777777, "grad_norm": 0.2504004750600532, "learning_rate": 2.6362210897687185e-05, "loss": 0.4475, "step": 1491 }, { "epoch": 1.5788359788359787, "grad_norm": 0.23627610401779106, "learning_rate": 2.6342610740885927e-05, "loss": 0.4387, "step": 1492 }, { "epoch": 1.57989417989418, "grad_norm": 0.24142802272346445, "learning_rate": 2.6323010584084673e-05, "loss": 0.4786, "step": 1493 }, { "epoch": 1.580952380952381, "grad_norm": 0.2175227164819747, "learning_rate": 2.630341042728342e-05, "loss": 0.3726, "step": 1494 }, { "epoch": 1.5820105820105819, "grad_norm": 0.2753522156824328, "learning_rate": 2.628381027048217e-05, "loss": 0.476, "step": 1495 }, { "epoch": 1.583068783068783, "grad_norm": 0.23691656483032675, "learning_rate": 2.626421011368091e-05, "loss": 0.4573, "step": 1496 }, { "epoch": 1.5841269841269843, "grad_norm": 0.2710431505452728, "learning_rate": 2.6244609956879657e-05, "loss": 0.4574, "step": 1497 }, { "epoch": 1.585185185185185, "grad_norm": 0.23025660214601665, "learning_rate": 2.6225009800078403e-05, "loss": 0.4509, "step": 1498 }, { "epoch": 1.5862433862433862, "grad_norm": 0.23275216490060904, "learning_rate": 2.6205409643277145e-05, "loss": 0.4509, "step": 1499 }, { "epoch": 1.5873015873015874, "grad_norm": 0.2515703737282682, "learning_rate": 2.6185809486475894e-05, "loss": 0.418, "step": 1500 }, { "epoch": 1.5883597883597882, "grad_norm": 0.22937595837034908, "learning_rate": 2.616620932967464e-05, "loss": 0.4749, "step": 1501 }, { "epoch": 1.5894179894179894, "grad_norm": 0.22662452164402827, "learning_rate": 2.6146609172873383e-05, "loss": 0.431, "step": 1502 }, { "epoch": 1.5904761904761906, "grad_norm": 0.24520042229444927, "learning_rate": 2.612700901607213e-05, "loss": 0.4334, "step": 1503 }, { "epoch": 1.5915343915343916, "grad_norm": 0.21389765221210016, "learning_rate": 2.6107408859270878e-05, "loss": 0.3985, "step": 1504 }, { "epoch": 1.5925925925925926, "grad_norm": 0.2520836403046001, "learning_rate": 2.6087808702469617e-05, "loss": 0.4952, "step": 1505 }, { "epoch": 1.5936507936507938, "grad_norm": 0.2590281741764871, "learning_rate": 2.6068208545668366e-05, "loss": 0.4153, "step": 1506 }, { "epoch": 1.5947089947089947, "grad_norm": 0.24382084508627283, "learning_rate": 2.6048608388867112e-05, "loss": 0.428, "step": 1507 }, { "epoch": 1.5957671957671957, "grad_norm": 0.2738433714268017, "learning_rate": 2.602900823206586e-05, "loss": 0.5297, "step": 1508 }, { "epoch": 1.596825396825397, "grad_norm": 0.24412194302796253, "learning_rate": 2.60094080752646e-05, "loss": 0.4653, "step": 1509 }, { "epoch": 1.597883597883598, "grad_norm": 0.24762704362472204, "learning_rate": 2.598980791846335e-05, "loss": 0.4746, "step": 1510 }, { "epoch": 1.5989417989417989, "grad_norm": 0.21515171581042802, "learning_rate": 2.5970207761662096e-05, "loss": 0.4151, "step": 1511 }, { "epoch": 1.6, "grad_norm": 0.2500799916715579, "learning_rate": 2.5950607604860838e-05, "loss": 0.417, "step": 1512 }, { "epoch": 1.601058201058201, "grad_norm": 0.2381688667695587, "learning_rate": 2.5931007448059584e-05, "loss": 0.4448, "step": 1513 }, { "epoch": 1.602116402116402, "grad_norm": 0.24191214514605877, "learning_rate": 2.5911407291258333e-05, "loss": 0.4683, "step": 1514 }, { "epoch": 1.6031746031746033, "grad_norm": 0.23784184704388528, "learning_rate": 2.5891807134457076e-05, "loss": 0.4265, "step": 1515 }, { "epoch": 1.6042328042328042, "grad_norm": 0.2734834110756681, "learning_rate": 2.587220697765582e-05, "loss": 0.439, "step": 1516 }, { "epoch": 1.6052910052910052, "grad_norm": 0.25396515727811014, "learning_rate": 2.5852606820854568e-05, "loss": 0.461, "step": 1517 }, { "epoch": 1.6063492063492064, "grad_norm": 0.242809898846282, "learning_rate": 2.5833006664053317e-05, "loss": 0.4573, "step": 1518 }, { "epoch": 1.6074074074074074, "grad_norm": 0.24330748961980134, "learning_rate": 2.581340650725206e-05, "loss": 0.4954, "step": 1519 }, { "epoch": 1.6084656084656084, "grad_norm": 0.27485852795834315, "learning_rate": 2.5793806350450805e-05, "loss": 0.428, "step": 1520 }, { "epoch": 1.6095238095238096, "grad_norm": 0.2445502116760993, "learning_rate": 2.577420619364955e-05, "loss": 0.4789, "step": 1521 }, { "epoch": 1.6105820105820106, "grad_norm": 0.20220337369742722, "learning_rate": 2.5754606036848293e-05, "loss": 0.4299, "step": 1522 }, { "epoch": 1.6116402116402115, "grad_norm": 0.2587743856048611, "learning_rate": 2.5735005880047043e-05, "loss": 0.4562, "step": 1523 }, { "epoch": 1.6126984126984127, "grad_norm": 0.24738675168343857, "learning_rate": 2.571540572324579e-05, "loss": 0.4662, "step": 1524 }, { "epoch": 1.6137566137566137, "grad_norm": 0.24385074926826564, "learning_rate": 2.569580556644453e-05, "loss": 0.5195, "step": 1525 }, { "epoch": 1.6148148148148147, "grad_norm": 0.23106098933576363, "learning_rate": 2.5676205409643277e-05, "loss": 0.4307, "step": 1526 }, { "epoch": 1.615873015873016, "grad_norm": 0.2717986108900208, "learning_rate": 2.5656605252842026e-05, "loss": 0.4243, "step": 1527 }, { "epoch": 1.6169312169312169, "grad_norm": 0.23631217436069807, "learning_rate": 2.563700509604077e-05, "loss": 0.4001, "step": 1528 }, { "epoch": 1.6179894179894179, "grad_norm": 0.21316396332645726, "learning_rate": 2.5617404939239515e-05, "loss": 0.4231, "step": 1529 }, { "epoch": 1.619047619047619, "grad_norm": 0.2529199744670375, "learning_rate": 2.559780478243826e-05, "loss": 0.4383, "step": 1530 }, { "epoch": 1.6201058201058203, "grad_norm": 0.25696462723939195, "learning_rate": 2.557820462563701e-05, "loss": 0.4237, "step": 1531 }, { "epoch": 1.621164021164021, "grad_norm": 0.21751147659743908, "learning_rate": 2.5558604468835752e-05, "loss": 0.4341, "step": 1532 }, { "epoch": 1.6222222222222222, "grad_norm": 0.23705868496582927, "learning_rate": 2.5539004312034498e-05, "loss": 0.4627, "step": 1533 }, { "epoch": 1.6232804232804234, "grad_norm": 0.228268665473131, "learning_rate": 2.5519404155233244e-05, "loss": 0.4752, "step": 1534 }, { "epoch": 1.6243386243386242, "grad_norm": 0.22482394544205297, "learning_rate": 2.5499803998431986e-05, "loss": 0.3858, "step": 1535 }, { "epoch": 1.6253968253968254, "grad_norm": 0.22061168915586668, "learning_rate": 2.5480203841630736e-05, "loss": 0.4521, "step": 1536 }, { "epoch": 1.6264550264550266, "grad_norm": 0.2187772213768095, "learning_rate": 2.546060368482948e-05, "loss": 0.4239, "step": 1537 }, { "epoch": 1.6275132275132274, "grad_norm": 0.24727489561841617, "learning_rate": 2.5441003528028224e-05, "loss": 0.4374, "step": 1538 }, { "epoch": 1.6285714285714286, "grad_norm": 0.22113247609707468, "learning_rate": 2.542140337122697e-05, "loss": 0.4134, "step": 1539 }, { "epoch": 1.6296296296296298, "grad_norm": 0.19784291165488221, "learning_rate": 2.540180321442572e-05, "loss": 0.4032, "step": 1540 }, { "epoch": 1.6306878306878307, "grad_norm": 0.2619577804116681, "learning_rate": 2.538220305762446e-05, "loss": 0.4637, "step": 1541 }, { "epoch": 1.6317460317460317, "grad_norm": 0.22538233425794543, "learning_rate": 2.5362602900823208e-05, "loss": 0.4441, "step": 1542 }, { "epoch": 1.632804232804233, "grad_norm": 0.24391278793008334, "learning_rate": 2.5343002744021953e-05, "loss": 0.4614, "step": 1543 }, { "epoch": 1.633862433862434, "grad_norm": 0.20585304887187394, "learning_rate": 2.5323402587220703e-05, "loss": 0.4181, "step": 1544 }, { "epoch": 1.6349206349206349, "grad_norm": 0.22818882395320167, "learning_rate": 2.5303802430419442e-05, "loss": 0.4149, "step": 1545 }, { "epoch": 1.635978835978836, "grad_norm": 0.2350090079830974, "learning_rate": 2.528420227361819e-05, "loss": 0.4735, "step": 1546 }, { "epoch": 1.637037037037037, "grad_norm": 0.22082978235289127, "learning_rate": 2.5264602116816937e-05, "loss": 0.4501, "step": 1547 }, { "epoch": 1.638095238095238, "grad_norm": 0.21032695498106663, "learning_rate": 2.524500196001568e-05, "loss": 0.4295, "step": 1548 }, { "epoch": 1.6391534391534393, "grad_norm": 0.21324437007116495, "learning_rate": 2.5225401803214425e-05, "loss": 0.3839, "step": 1549 }, { "epoch": 1.6402116402116402, "grad_norm": 0.2389234288277766, "learning_rate": 2.5205801646413175e-05, "loss": 0.4673, "step": 1550 }, { "epoch": 1.6412698412698412, "grad_norm": 0.20583896765782994, "learning_rate": 2.5186201489611917e-05, "loss": 0.4092, "step": 1551 }, { "epoch": 1.6423280423280424, "grad_norm": 0.22499957708636312, "learning_rate": 2.5166601332810663e-05, "loss": 0.4721, "step": 1552 }, { "epoch": 1.6433862433862434, "grad_norm": 0.8465391318181628, "learning_rate": 2.514700117600941e-05, "loss": 0.4525, "step": 1553 }, { "epoch": 1.6444444444444444, "grad_norm": 0.2196056190105969, "learning_rate": 2.512740101920815e-05, "loss": 0.3989, "step": 1554 }, { "epoch": 1.6455026455026456, "grad_norm": 0.2638083915277794, "learning_rate": 2.51078008624069e-05, "loss": 0.4352, "step": 1555 }, { "epoch": 1.6465608465608466, "grad_norm": 0.25435791435225086, "learning_rate": 2.5088200705605646e-05, "loss": 0.4399, "step": 1556 }, { "epoch": 1.6476190476190475, "grad_norm": 0.23162932695663496, "learning_rate": 2.5068600548804392e-05, "loss": 0.4378, "step": 1557 }, { "epoch": 1.6486772486772487, "grad_norm": 0.24325491659040138, "learning_rate": 2.5049000392003135e-05, "loss": 0.4726, "step": 1558 }, { "epoch": 1.6497354497354497, "grad_norm": 0.2692936406441785, "learning_rate": 2.5029400235201884e-05, "loss": 0.503, "step": 1559 }, { "epoch": 1.6507936507936507, "grad_norm": 0.23044637601493173, "learning_rate": 2.500980007840063e-05, "loss": 0.4153, "step": 1560 }, { "epoch": 1.651851851851852, "grad_norm": 0.19978472969279348, "learning_rate": 2.4990199921599376e-05, "loss": 0.4189, "step": 1561 }, { "epoch": 1.6529100529100529, "grad_norm": 2.0130289348317545, "learning_rate": 2.497059976479812e-05, "loss": 0.5285, "step": 1562 }, { "epoch": 1.6539682539682539, "grad_norm": 0.6382100207364493, "learning_rate": 2.4950999607996864e-05, "loss": 0.498, "step": 1563 }, { "epoch": 1.655026455026455, "grad_norm": 0.2873980573844794, "learning_rate": 2.493139945119561e-05, "loss": 0.4345, "step": 1564 }, { "epoch": 1.656084656084656, "grad_norm": 0.2238132320590369, "learning_rate": 2.4911799294394356e-05, "loss": 0.4107, "step": 1565 }, { "epoch": 1.657142857142857, "grad_norm": 0.22847079158253494, "learning_rate": 2.4892199137593102e-05, "loss": 0.4377, "step": 1566 }, { "epoch": 1.6582010582010582, "grad_norm": 0.27256045565172915, "learning_rate": 2.4872598980791848e-05, "loss": 0.4313, "step": 1567 }, { "epoch": 1.6592592592592592, "grad_norm": 0.2586293011878613, "learning_rate": 2.4852998823990594e-05, "loss": 0.4437, "step": 1568 }, { "epoch": 1.6603174603174602, "grad_norm": 0.22800723424295755, "learning_rate": 2.483339866718934e-05, "loss": 0.4273, "step": 1569 }, { "epoch": 1.6613756613756614, "grad_norm": 0.24485218448636267, "learning_rate": 2.4813798510388085e-05, "loss": 0.4393, "step": 1570 }, { "epoch": 1.6624338624338626, "grad_norm": 0.2484620989572041, "learning_rate": 2.479419835358683e-05, "loss": 0.3975, "step": 1571 }, { "epoch": 1.6634920634920634, "grad_norm": 0.2124965525509564, "learning_rate": 2.4774598196785577e-05, "loss": 0.4508, "step": 1572 }, { "epoch": 1.6645502645502646, "grad_norm": 0.24457740342964363, "learning_rate": 2.475499803998432e-05, "loss": 0.4299, "step": 1573 }, { "epoch": 1.6656084656084658, "grad_norm": 0.25001350248928084, "learning_rate": 2.473539788318307e-05, "loss": 0.4924, "step": 1574 }, { "epoch": 1.6666666666666665, "grad_norm": 0.2307802816899225, "learning_rate": 2.471579772638181e-05, "loss": 0.4466, "step": 1575 }, { "epoch": 1.6677248677248677, "grad_norm": 0.22210528313961542, "learning_rate": 2.469619756958056e-05, "loss": 0.404, "step": 1576 }, { "epoch": 1.668783068783069, "grad_norm": 0.21443376857710403, "learning_rate": 2.4676597412779303e-05, "loss": 0.4274, "step": 1577 }, { "epoch": 1.6698412698412697, "grad_norm": 0.22624353684944185, "learning_rate": 2.465699725597805e-05, "loss": 0.436, "step": 1578 }, { "epoch": 1.6708994708994709, "grad_norm": 0.23099050262288628, "learning_rate": 2.4637397099176795e-05, "loss": 0.4582, "step": 1579 }, { "epoch": 1.671957671957672, "grad_norm": 0.20700052537048644, "learning_rate": 2.461779694237554e-05, "loss": 0.4105, "step": 1580 }, { "epoch": 1.673015873015873, "grad_norm": 0.21408619017989325, "learning_rate": 2.4598196785574283e-05, "loss": 0.4435, "step": 1581 }, { "epoch": 1.674074074074074, "grad_norm": 0.22932373910076814, "learning_rate": 2.4578596628773032e-05, "loss": 0.4423, "step": 1582 }, { "epoch": 1.6751322751322753, "grad_norm": 0.2701302455070178, "learning_rate": 2.4558996471971775e-05, "loss": 0.491, "step": 1583 }, { "epoch": 1.6761904761904762, "grad_norm": 0.22544337265587341, "learning_rate": 2.4539396315170524e-05, "loss": 0.4743, "step": 1584 }, { "epoch": 1.6772486772486772, "grad_norm": 0.21709705988111216, "learning_rate": 2.4519796158369267e-05, "loss": 0.4065, "step": 1585 }, { "epoch": 1.6783068783068784, "grad_norm": 0.24996850315949026, "learning_rate": 2.4500196001568013e-05, "loss": 0.4375, "step": 1586 }, { "epoch": 1.6793650793650794, "grad_norm": 0.9469921058616022, "learning_rate": 2.448059584476676e-05, "loss": 0.4191, "step": 1587 }, { "epoch": 1.6804232804232804, "grad_norm": 0.22085668242729445, "learning_rate": 2.4460995687965504e-05, "loss": 0.48, "step": 1588 }, { "epoch": 1.6814814814814816, "grad_norm": 0.23494830806105532, "learning_rate": 2.444139553116425e-05, "loss": 0.4781, "step": 1589 }, { "epoch": 1.6825396825396826, "grad_norm": 0.26542532348970826, "learning_rate": 2.4421795374362996e-05, "loss": 0.4089, "step": 1590 }, { "epoch": 1.6835978835978835, "grad_norm": 0.25254211616363226, "learning_rate": 2.4402195217561742e-05, "loss": 0.4629, "step": 1591 }, { "epoch": 1.6846560846560847, "grad_norm": 0.2273568902013295, "learning_rate": 2.4382595060760488e-05, "loss": 0.4592, "step": 1592 }, { "epoch": 1.6857142857142857, "grad_norm": 0.2594410680121094, "learning_rate": 2.4362994903959234e-05, "loss": 0.439, "step": 1593 }, { "epoch": 1.6867724867724867, "grad_norm": 0.2274722145506129, "learning_rate": 2.4343394747157976e-05, "loss": 0.4246, "step": 1594 }, { "epoch": 1.687830687830688, "grad_norm": 0.2400296192105033, "learning_rate": 2.4323794590356725e-05, "loss": 0.4985, "step": 1595 }, { "epoch": 1.6888888888888889, "grad_norm": 0.2335755377011187, "learning_rate": 2.4304194433555468e-05, "loss": 0.5176, "step": 1596 }, { "epoch": 1.6899470899470899, "grad_norm": 0.22541799468462043, "learning_rate": 2.4284594276754217e-05, "loss": 0.391, "step": 1597 }, { "epoch": 1.691005291005291, "grad_norm": 0.25348107176315676, "learning_rate": 2.426499411995296e-05, "loss": 0.4718, "step": 1598 }, { "epoch": 1.692063492063492, "grad_norm": 0.2134398598463444, "learning_rate": 2.4245393963151706e-05, "loss": 0.4563, "step": 1599 }, { "epoch": 1.693121693121693, "grad_norm": 0.2799436267035962, "learning_rate": 2.422579380635045e-05, "loss": 0.4554, "step": 1600 }, { "epoch": 1.6941798941798942, "grad_norm": 0.2239050910441869, "learning_rate": 2.4206193649549197e-05, "loss": 0.4303, "step": 1601 }, { "epoch": 1.6952380952380952, "grad_norm": 5.342628537787954, "learning_rate": 2.4186593492747943e-05, "loss": 0.4961, "step": 1602 }, { "epoch": 1.6962962962962962, "grad_norm": 1.9189218328198483, "learning_rate": 2.416699333594669e-05, "loss": 0.5465, "step": 1603 }, { "epoch": 1.6973544973544974, "grad_norm": 0.22908151431119422, "learning_rate": 2.4147393179145435e-05, "loss": 0.4628, "step": 1604 }, { "epoch": 1.6984126984126984, "grad_norm": 0.20485996984971452, "learning_rate": 2.412779302234418e-05, "loss": 0.4013, "step": 1605 }, { "epoch": 1.6994708994708994, "grad_norm": 0.21384044603948377, "learning_rate": 2.4108192865542927e-05, "loss": 0.4111, "step": 1606 }, { "epoch": 1.7005291005291006, "grad_norm": 0.20944814894771038, "learning_rate": 2.4088592708741673e-05, "loss": 0.4487, "step": 1607 }, { "epoch": 1.7015873015873015, "grad_norm": 0.21567540070670468, "learning_rate": 2.406899255194042e-05, "loss": 0.4157, "step": 1608 }, { "epoch": 1.7026455026455025, "grad_norm": 0.21951670349669852, "learning_rate": 2.404939239513916e-05, "loss": 0.3811, "step": 1609 }, { "epoch": 1.7037037037037037, "grad_norm": 0.21975151815133592, "learning_rate": 2.402979223833791e-05, "loss": 0.4788, "step": 1610 }, { "epoch": 1.704761904761905, "grad_norm": 0.20831580759962212, "learning_rate": 2.4010192081536653e-05, "loss": 0.4237, "step": 1611 }, { "epoch": 1.7058201058201057, "grad_norm": 0.20479233675329178, "learning_rate": 2.39905919247354e-05, "loss": 0.4125, "step": 1612 }, { "epoch": 1.7068783068783069, "grad_norm": 0.21043974195059237, "learning_rate": 2.3970991767934144e-05, "loss": 0.4114, "step": 1613 }, { "epoch": 1.707936507936508, "grad_norm": 0.19363060033043297, "learning_rate": 2.395139161113289e-05, "loss": 0.3958, "step": 1614 }, { "epoch": 1.7089947089947088, "grad_norm": 0.2203472309140231, "learning_rate": 2.3931791454331636e-05, "loss": 0.4136, "step": 1615 }, { "epoch": 1.71005291005291, "grad_norm": 0.27731224351626677, "learning_rate": 2.3912191297530382e-05, "loss": 0.428, "step": 1616 }, { "epoch": 1.7111111111111112, "grad_norm": 0.2165312271941672, "learning_rate": 2.3892591140729124e-05, "loss": 0.4455, "step": 1617 }, { "epoch": 1.712169312169312, "grad_norm": 0.21755495769093408, "learning_rate": 2.3872990983927874e-05, "loss": 0.4031, "step": 1618 }, { "epoch": 1.7132275132275132, "grad_norm": 0.24222900168163267, "learning_rate": 2.3853390827126616e-05, "loss": 0.4786, "step": 1619 }, { "epoch": 1.7142857142857144, "grad_norm": 0.23418054019764165, "learning_rate": 2.3833790670325365e-05, "loss": 0.4608, "step": 1620 }, { "epoch": 1.7153439153439154, "grad_norm": 0.21694499784225765, "learning_rate": 2.3814190513524108e-05, "loss": 0.4283, "step": 1621 }, { "epoch": 1.7164021164021164, "grad_norm": 0.2546334351383069, "learning_rate": 2.3794590356722854e-05, "loss": 0.4606, "step": 1622 }, { "epoch": 1.7174603174603176, "grad_norm": 0.2319584310490375, "learning_rate": 2.37749901999216e-05, "loss": 0.451, "step": 1623 }, { "epoch": 1.7185185185185186, "grad_norm": 0.24477601621399064, "learning_rate": 2.3755390043120346e-05, "loss": 0.5059, "step": 1624 }, { "epoch": 1.7195767195767195, "grad_norm": 0.36838675048518105, "learning_rate": 2.373578988631909e-05, "loss": 0.4285, "step": 1625 }, { "epoch": 1.7206349206349207, "grad_norm": 0.2466732852047628, "learning_rate": 2.3716189729517837e-05, "loss": 0.4437, "step": 1626 }, { "epoch": 1.7216931216931217, "grad_norm": 0.2135957001631125, "learning_rate": 2.3696589572716583e-05, "loss": 0.421, "step": 1627 }, { "epoch": 1.7227513227513227, "grad_norm": 0.24026169452843113, "learning_rate": 2.367698941591533e-05, "loss": 0.4532, "step": 1628 }, { "epoch": 1.723809523809524, "grad_norm": 0.22254374311366967, "learning_rate": 2.3657389259114075e-05, "loss": 0.4547, "step": 1629 }, { "epoch": 1.7248677248677249, "grad_norm": 0.2379541965307219, "learning_rate": 2.3637789102312817e-05, "loss": 0.4735, "step": 1630 }, { "epoch": 1.7259259259259259, "grad_norm": 0.21785973896658944, "learning_rate": 2.3618188945511567e-05, "loss": 0.4757, "step": 1631 }, { "epoch": 1.726984126984127, "grad_norm": 0.20413785133119947, "learning_rate": 2.359858878871031e-05, "loss": 0.4213, "step": 1632 }, { "epoch": 1.728042328042328, "grad_norm": 0.25684940141007967, "learning_rate": 2.357898863190906e-05, "loss": 0.4839, "step": 1633 }, { "epoch": 1.729100529100529, "grad_norm": 0.22575583816002107, "learning_rate": 2.35593884751078e-05, "loss": 0.4319, "step": 1634 }, { "epoch": 1.7301587301587302, "grad_norm": 0.2102700377358579, "learning_rate": 2.3539788318306547e-05, "loss": 0.4465, "step": 1635 }, { "epoch": 1.7312169312169312, "grad_norm": 0.23871894363562643, "learning_rate": 2.3520188161505293e-05, "loss": 0.4692, "step": 1636 }, { "epoch": 1.7322751322751322, "grad_norm": 0.2257335899512697, "learning_rate": 2.350058800470404e-05, "loss": 0.4308, "step": 1637 }, { "epoch": 1.7333333333333334, "grad_norm": 0.21199670925667166, "learning_rate": 2.3480987847902784e-05, "loss": 0.4399, "step": 1638 }, { "epoch": 1.7343915343915344, "grad_norm": 0.2637943802055583, "learning_rate": 2.346138769110153e-05, "loss": 0.469, "step": 1639 }, { "epoch": 1.7354497354497354, "grad_norm": 0.2091292729089544, "learning_rate": 2.3441787534300276e-05, "loss": 0.4079, "step": 1640 }, { "epoch": 1.7365079365079366, "grad_norm": 0.21242035801784248, "learning_rate": 2.3422187377499022e-05, "loss": 0.433, "step": 1641 }, { "epoch": 1.7375661375661375, "grad_norm": 0.21411367696110664, "learning_rate": 2.3402587220697768e-05, "loss": 0.445, "step": 1642 }, { "epoch": 1.7386243386243385, "grad_norm": 0.22666171986236078, "learning_rate": 2.338298706389651e-05, "loss": 0.4352, "step": 1643 }, { "epoch": 1.7396825396825397, "grad_norm": 0.2154746362675361, "learning_rate": 2.336338690709526e-05, "loss": 0.4225, "step": 1644 }, { "epoch": 1.7407407407407407, "grad_norm": 0.20835776969269476, "learning_rate": 2.3343786750294002e-05, "loss": 0.423, "step": 1645 }, { "epoch": 1.7417989417989417, "grad_norm": 0.22888559147228213, "learning_rate": 2.332418659349275e-05, "loss": 0.4047, "step": 1646 }, { "epoch": 1.7428571428571429, "grad_norm": 0.2211280651344821, "learning_rate": 2.3304586436691494e-05, "loss": 0.3904, "step": 1647 }, { "epoch": 1.7439153439153439, "grad_norm": 0.23013001218859871, "learning_rate": 2.328498627989024e-05, "loss": 0.4983, "step": 1648 }, { "epoch": 1.7449735449735448, "grad_norm": 0.2202927426992636, "learning_rate": 2.3265386123088986e-05, "loss": 0.4676, "step": 1649 }, { "epoch": 1.746031746031746, "grad_norm": 0.22953761733384298, "learning_rate": 2.324578596628773e-05, "loss": 0.45, "step": 1650 }, { "epoch": 1.7470899470899472, "grad_norm": 0.22282806910302103, "learning_rate": 2.3226185809486477e-05, "loss": 0.4218, "step": 1651 }, { "epoch": 1.748148148148148, "grad_norm": 0.23634054688203343, "learning_rate": 2.3206585652685223e-05, "loss": 0.464, "step": 1652 }, { "epoch": 1.7492063492063492, "grad_norm": 0.22465725068012996, "learning_rate": 2.3186985495883966e-05, "loss": 0.448, "step": 1653 }, { "epoch": 1.7502645502645504, "grad_norm": 0.23369312668049108, "learning_rate": 2.3167385339082715e-05, "loss": 0.4475, "step": 1654 }, { "epoch": 1.7513227513227512, "grad_norm": 0.21668131882841285, "learning_rate": 2.3147785182281458e-05, "loss": 0.4246, "step": 1655 }, { "epoch": 1.7523809523809524, "grad_norm": 0.22432886200596663, "learning_rate": 2.3128185025480207e-05, "loss": 0.4726, "step": 1656 }, { "epoch": 1.7534391534391536, "grad_norm": 0.23217067772618433, "learning_rate": 2.310858486867895e-05, "loss": 0.4217, "step": 1657 }, { "epoch": 1.7544973544973543, "grad_norm": 0.23998057651431876, "learning_rate": 2.3088984711877695e-05, "loss": 0.4478, "step": 1658 }, { "epoch": 1.7555555555555555, "grad_norm": 0.20789843322357746, "learning_rate": 2.306938455507644e-05, "loss": 0.4093, "step": 1659 }, { "epoch": 1.7566137566137567, "grad_norm": 0.18321420188469675, "learning_rate": 2.3049784398275187e-05, "loss": 0.3687, "step": 1660 }, { "epoch": 1.7576719576719577, "grad_norm": 0.24493094649795308, "learning_rate": 2.3030184241473933e-05, "loss": 0.4494, "step": 1661 }, { "epoch": 1.7587301587301587, "grad_norm": 0.2091468538339814, "learning_rate": 2.301058408467268e-05, "loss": 0.4199, "step": 1662 }, { "epoch": 1.75978835978836, "grad_norm": 0.2071184685848974, "learning_rate": 2.2990983927871425e-05, "loss": 0.4324, "step": 1663 }, { "epoch": 1.7608465608465609, "grad_norm": 0.24913461114215388, "learning_rate": 2.297138377107017e-05, "loss": 0.467, "step": 1664 }, { "epoch": 1.7619047619047619, "grad_norm": 0.2391544344366436, "learning_rate": 2.2951783614268916e-05, "loss": 0.4312, "step": 1665 }, { "epoch": 1.762962962962963, "grad_norm": 0.22154350725206487, "learning_rate": 2.293218345746766e-05, "loss": 0.4597, "step": 1666 }, { "epoch": 1.764021164021164, "grad_norm": 0.24501309931712809, "learning_rate": 2.2912583300666408e-05, "loss": 0.459, "step": 1667 }, { "epoch": 1.765079365079365, "grad_norm": 0.25526129904002803, "learning_rate": 2.289298314386515e-05, "loss": 0.4835, "step": 1668 }, { "epoch": 1.7661375661375662, "grad_norm": 0.21646297720766336, "learning_rate": 2.28733829870639e-05, "loss": 0.4335, "step": 1669 }, { "epoch": 1.7671957671957672, "grad_norm": 0.22733884215028632, "learning_rate": 2.2853782830262642e-05, "loss": 0.4552, "step": 1670 }, { "epoch": 1.7682539682539682, "grad_norm": 0.2075067401096603, "learning_rate": 2.2834182673461388e-05, "loss": 0.4138, "step": 1671 }, { "epoch": 1.7693121693121694, "grad_norm": 0.2257968183277266, "learning_rate": 2.2814582516660134e-05, "loss": 0.5253, "step": 1672 }, { "epoch": 1.7703703703703704, "grad_norm": 0.21533345478212504, "learning_rate": 2.279498235985888e-05, "loss": 0.4942, "step": 1673 }, { "epoch": 1.7714285714285714, "grad_norm": 0.22258782531542667, "learning_rate": 2.2775382203057626e-05, "loss": 0.4511, "step": 1674 }, { "epoch": 1.7724867724867726, "grad_norm": 0.2323804687576553, "learning_rate": 2.275578204625637e-05, "loss": 0.439, "step": 1675 }, { "epoch": 1.7735449735449735, "grad_norm": 0.22142223429635433, "learning_rate": 2.2736181889455118e-05, "loss": 0.4404, "step": 1676 }, { "epoch": 1.7746031746031745, "grad_norm": 0.24472248169409241, "learning_rate": 2.2716581732653863e-05, "loss": 0.4276, "step": 1677 }, { "epoch": 1.7756613756613757, "grad_norm": 0.49844277498031275, "learning_rate": 2.269698157585261e-05, "loss": 0.3956, "step": 1678 }, { "epoch": 1.7767195767195767, "grad_norm": 0.228348531852297, "learning_rate": 2.2677381419051352e-05, "loss": 0.4511, "step": 1679 }, { "epoch": 1.7777777777777777, "grad_norm": 0.2886498501182977, "learning_rate": 2.26577812622501e-05, "loss": 0.4456, "step": 1680 }, { "epoch": 1.7788359788359789, "grad_norm": 4.492214718550815, "learning_rate": 2.2638181105448844e-05, "loss": 0.5011, "step": 1681 }, { "epoch": 1.7798941798941799, "grad_norm": 0.24204760220136187, "learning_rate": 2.2618580948647593e-05, "loss": 0.4279, "step": 1682 }, { "epoch": 1.7809523809523808, "grad_norm": 0.23756281310500446, "learning_rate": 2.2598980791846335e-05, "loss": 0.4534, "step": 1683 }, { "epoch": 1.782010582010582, "grad_norm": 0.21980783401447584, "learning_rate": 2.257938063504508e-05, "loss": 0.4446, "step": 1684 }, { "epoch": 1.783068783068783, "grad_norm": 0.2087115413343064, "learning_rate": 2.2559780478243827e-05, "loss": 0.3605, "step": 1685 }, { "epoch": 1.784126984126984, "grad_norm": 0.2543465932738077, "learning_rate": 2.2540180321442573e-05, "loss": 0.4842, "step": 1686 }, { "epoch": 1.7851851851851852, "grad_norm": 0.22912923330057322, "learning_rate": 2.252058016464132e-05, "loss": 0.486, "step": 1687 }, { "epoch": 1.7862433862433862, "grad_norm": 0.21781329156771057, "learning_rate": 2.2500980007840065e-05, "loss": 0.4683, "step": 1688 }, { "epoch": 1.7873015873015872, "grad_norm": 0.21952765550999895, "learning_rate": 2.2481379851038807e-05, "loss": 0.449, "step": 1689 }, { "epoch": 1.7883597883597884, "grad_norm": 0.2419599965780607, "learning_rate": 2.2461779694237556e-05, "loss": 0.5227, "step": 1690 }, { "epoch": 1.7894179894179896, "grad_norm": 0.23359674854150045, "learning_rate": 2.24421795374363e-05, "loss": 0.4485, "step": 1691 }, { "epoch": 1.7904761904761903, "grad_norm": 0.21983049948624178, "learning_rate": 2.2422579380635045e-05, "loss": 0.4553, "step": 1692 }, { "epoch": 1.7915343915343915, "grad_norm": 0.22263523405847083, "learning_rate": 2.240297922383379e-05, "loss": 0.4627, "step": 1693 }, { "epoch": 1.7925925925925927, "grad_norm": 0.21875294678933027, "learning_rate": 2.2383379067032536e-05, "loss": 0.452, "step": 1694 }, { "epoch": 1.7936507936507935, "grad_norm": 0.2190892904324516, "learning_rate": 2.2363778910231282e-05, "loss": 0.4176, "step": 1695 }, { "epoch": 1.7947089947089947, "grad_norm": 0.20100867962420854, "learning_rate": 2.2344178753430028e-05, "loss": 0.4424, "step": 1696 }, { "epoch": 1.795767195767196, "grad_norm": 0.22957554331685712, "learning_rate": 2.2324578596628774e-05, "loss": 0.471, "step": 1697 }, { "epoch": 1.7968253968253967, "grad_norm": 0.22879161164574066, "learning_rate": 2.230497843982752e-05, "loss": 0.497, "step": 1698 }, { "epoch": 1.7978835978835979, "grad_norm": 0.19578829621547492, "learning_rate": 2.2285378283026266e-05, "loss": 0.3854, "step": 1699 }, { "epoch": 1.798941798941799, "grad_norm": 0.21006387291689507, "learning_rate": 2.2265778126225012e-05, "loss": 0.4098, "step": 1700 }, { "epoch": 1.8, "grad_norm": 0.23977296269604864, "learning_rate": 2.2246177969423758e-05, "loss": 0.4905, "step": 1701 }, { "epoch": 1.801058201058201, "grad_norm": 0.2105622462139618, "learning_rate": 2.22265778126225e-05, "loss": 0.4283, "step": 1702 }, { "epoch": 1.8021164021164022, "grad_norm": 0.22056822514598742, "learning_rate": 2.220697765582125e-05, "loss": 0.4205, "step": 1703 }, { "epoch": 1.8031746031746032, "grad_norm": 0.20582405401017348, "learning_rate": 2.2187377499019992e-05, "loss": 0.4471, "step": 1704 }, { "epoch": 1.8042328042328042, "grad_norm": 0.24182764269625914, "learning_rate": 2.216777734221874e-05, "loss": 0.4553, "step": 1705 }, { "epoch": 1.8052910052910054, "grad_norm": 0.20679090700136382, "learning_rate": 2.2148177185417484e-05, "loss": 0.451, "step": 1706 }, { "epoch": 1.8063492063492064, "grad_norm": 0.19092027693402705, "learning_rate": 2.212857702861623e-05, "loss": 0.3698, "step": 1707 }, { "epoch": 1.8074074074074074, "grad_norm": 0.23205314819347692, "learning_rate": 2.2108976871814975e-05, "loss": 0.4511, "step": 1708 }, { "epoch": 1.8084656084656086, "grad_norm": 0.21244419197626072, "learning_rate": 2.208937671501372e-05, "loss": 0.426, "step": 1709 }, { "epoch": 1.8095238095238095, "grad_norm": 0.22435395018849072, "learning_rate": 2.2069776558212467e-05, "loss": 0.4626, "step": 1710 }, { "epoch": 1.8105820105820105, "grad_norm": 0.24022212831540976, "learning_rate": 2.2050176401411213e-05, "loss": 0.4383, "step": 1711 }, { "epoch": 1.8116402116402117, "grad_norm": 0.20705066488075852, "learning_rate": 2.203057624460996e-05, "loss": 0.4122, "step": 1712 }, { "epoch": 1.8126984126984127, "grad_norm": 0.22592755674231862, "learning_rate": 2.2010976087808705e-05, "loss": 0.4842, "step": 1713 }, { "epoch": 1.8137566137566137, "grad_norm": 0.21271788549637888, "learning_rate": 2.199137593100745e-05, "loss": 0.4389, "step": 1714 }, { "epoch": 1.8148148148148149, "grad_norm": 0.23889518702932439, "learning_rate": 2.1971775774206193e-05, "loss": 0.4717, "step": 1715 }, { "epoch": 1.8158730158730159, "grad_norm": 0.2069283317504962, "learning_rate": 2.1952175617404942e-05, "loss": 0.4386, "step": 1716 }, { "epoch": 1.8169312169312168, "grad_norm": 0.20579833750263932, "learning_rate": 2.1932575460603685e-05, "loss": 0.4147, "step": 1717 }, { "epoch": 1.817989417989418, "grad_norm": 0.2249576933304194, "learning_rate": 2.1912975303802434e-05, "loss": 0.4128, "step": 1718 }, { "epoch": 1.819047619047619, "grad_norm": 0.24258567681504786, "learning_rate": 2.1893375147001177e-05, "loss": 0.496, "step": 1719 }, { "epoch": 1.82010582010582, "grad_norm": 2.4445652502189414, "learning_rate": 2.1873774990199922e-05, "loss": 0.5013, "step": 1720 }, { "epoch": 1.8211640211640212, "grad_norm": 0.22937999540510728, "learning_rate": 2.185417483339867e-05, "loss": 0.458, "step": 1721 }, { "epoch": 1.8222222222222222, "grad_norm": 0.22188766496336873, "learning_rate": 2.1834574676597414e-05, "loss": 0.4118, "step": 1722 }, { "epoch": 1.8232804232804232, "grad_norm": 0.20985186497122293, "learning_rate": 2.1814974519796157e-05, "loss": 0.4202, "step": 1723 }, { "epoch": 1.8243386243386244, "grad_norm": 5.620824290236189, "learning_rate": 2.1795374362994906e-05, "loss": 0.6836, "step": 1724 }, { "epoch": 1.8253968253968254, "grad_norm": 0.2732860325354266, "learning_rate": 2.177577420619365e-05, "loss": 0.5006, "step": 1725 }, { "epoch": 1.8264550264550263, "grad_norm": 0.2518168942236389, "learning_rate": 2.1756174049392398e-05, "loss": 0.4803, "step": 1726 }, { "epoch": 1.8275132275132275, "grad_norm": 0.30831471115695636, "learning_rate": 2.173657389259114e-05, "loss": 0.4059, "step": 1727 }, { "epoch": 1.8285714285714287, "grad_norm": 0.2518548837185105, "learning_rate": 2.1716973735789886e-05, "loss": 0.4423, "step": 1728 }, { "epoch": 1.8296296296296295, "grad_norm": 0.23578369543941782, "learning_rate": 2.1697373578988632e-05, "loss": 0.4302, "step": 1729 }, { "epoch": 1.8306878306878307, "grad_norm": 0.2417044417728648, "learning_rate": 2.1677773422187378e-05, "loss": 0.4478, "step": 1730 }, { "epoch": 1.831746031746032, "grad_norm": 0.21978275736190653, "learning_rate": 2.1658173265386124e-05, "loss": 0.4316, "step": 1731 }, { "epoch": 1.8328042328042327, "grad_norm": 0.22025067994263706, "learning_rate": 2.163857310858487e-05, "loss": 0.4013, "step": 1732 }, { "epoch": 1.8338624338624339, "grad_norm": 0.23323951970705392, "learning_rate": 2.1618972951783615e-05, "loss": 0.4603, "step": 1733 }, { "epoch": 1.834920634920635, "grad_norm": 0.2120845002669665, "learning_rate": 2.159937279498236e-05, "loss": 0.4597, "step": 1734 }, { "epoch": 1.8359788359788358, "grad_norm": 0.22367193469028424, "learning_rate": 2.1579772638181107e-05, "loss": 0.4438, "step": 1735 }, { "epoch": 1.837037037037037, "grad_norm": 0.21769442259199298, "learning_rate": 2.1560172481379853e-05, "loss": 0.4325, "step": 1736 }, { "epoch": 1.8380952380952382, "grad_norm": 0.2248745754640526, "learning_rate": 2.15405723245786e-05, "loss": 0.432, "step": 1737 }, { "epoch": 1.8391534391534392, "grad_norm": 0.21085112873884912, "learning_rate": 2.152097216777734e-05, "loss": 0.4293, "step": 1738 }, { "epoch": 1.8402116402116402, "grad_norm": 0.20872522105623528, "learning_rate": 2.150137201097609e-05, "loss": 0.4546, "step": 1739 }, { "epoch": 1.8412698412698414, "grad_norm": 0.2285263541285715, "learning_rate": 2.1481771854174833e-05, "loss": 0.4051, "step": 1740 }, { "epoch": 1.8423280423280424, "grad_norm": 0.22784803646681845, "learning_rate": 2.146217169737358e-05, "loss": 0.446, "step": 1741 }, { "epoch": 1.8433862433862434, "grad_norm": 0.26036748975604307, "learning_rate": 2.1442571540572325e-05, "loss": 0.4154, "step": 1742 }, { "epoch": 1.8444444444444446, "grad_norm": 0.2387042021248444, "learning_rate": 2.142297138377107e-05, "loss": 0.4309, "step": 1743 }, { "epoch": 1.8455026455026455, "grad_norm": 0.2182840280137568, "learning_rate": 2.1403371226969817e-05, "loss": 0.4367, "step": 1744 }, { "epoch": 1.8465608465608465, "grad_norm": 0.22907755338440494, "learning_rate": 2.1383771070168563e-05, "loss": 0.4255, "step": 1745 }, { "epoch": 1.8476190476190477, "grad_norm": 0.20993653159537567, "learning_rate": 2.136417091336731e-05, "loss": 0.4389, "step": 1746 }, { "epoch": 1.8486772486772487, "grad_norm": 0.21338544578060106, "learning_rate": 2.1344570756566054e-05, "loss": 0.461, "step": 1747 }, { "epoch": 1.8497354497354497, "grad_norm": 0.21341559351141076, "learning_rate": 2.13249705997648e-05, "loss": 0.4839, "step": 1748 }, { "epoch": 1.8507936507936509, "grad_norm": 0.24079068460679487, "learning_rate": 2.1305370442963546e-05, "loss": 0.4823, "step": 1749 }, { "epoch": 1.8518518518518519, "grad_norm": 0.20444170286570748, "learning_rate": 2.1285770286162292e-05, "loss": 0.4461, "step": 1750 }, { "epoch": 1.8529100529100528, "grad_norm": 0.2387410951821536, "learning_rate": 2.1266170129361034e-05, "loss": 0.5042, "step": 1751 }, { "epoch": 1.853968253968254, "grad_norm": 0.19937829720047517, "learning_rate": 2.1246569972559784e-05, "loss": 0.3733, "step": 1752 }, { "epoch": 1.855026455026455, "grad_norm": 0.20349986087488134, "learning_rate": 2.1226969815758526e-05, "loss": 0.4192, "step": 1753 }, { "epoch": 1.856084656084656, "grad_norm": 0.19942778332550556, "learning_rate": 2.1207369658957275e-05, "loss": 0.4123, "step": 1754 }, { "epoch": 1.8571428571428572, "grad_norm": 0.2395297045424899, "learning_rate": 2.1187769502156018e-05, "loss": 0.4858, "step": 1755 }, { "epoch": 1.8582010582010582, "grad_norm": 0.20249034937090846, "learning_rate": 2.1168169345354764e-05, "loss": 0.4741, "step": 1756 }, { "epoch": 1.8592592592592592, "grad_norm": 0.22048497676461043, "learning_rate": 2.114856918855351e-05, "loss": 0.45, "step": 1757 }, { "epoch": 1.8603174603174604, "grad_norm": 0.22049360912083787, "learning_rate": 2.1128969031752256e-05, "loss": 0.5033, "step": 1758 }, { "epoch": 1.8613756613756614, "grad_norm": 0.22074914003644552, "learning_rate": 2.1109368874950998e-05, "loss": 0.4399, "step": 1759 }, { "epoch": 1.8624338624338623, "grad_norm": 0.2160312595040178, "learning_rate": 2.1089768718149747e-05, "loss": 0.4702, "step": 1760 }, { "epoch": 1.8634920634920635, "grad_norm": 0.206865371111694, "learning_rate": 2.107016856134849e-05, "loss": 0.4652, "step": 1761 }, { "epoch": 1.8645502645502645, "grad_norm": 0.2091592072021074, "learning_rate": 2.105056840454724e-05, "loss": 0.4182, "step": 1762 }, { "epoch": 1.8656084656084655, "grad_norm": 0.23339044723826416, "learning_rate": 2.103096824774598e-05, "loss": 0.5277, "step": 1763 }, { "epoch": 1.8666666666666667, "grad_norm": 0.4460094384942643, "learning_rate": 2.1011368090944727e-05, "loss": 0.4579, "step": 1764 }, { "epoch": 1.8677248677248677, "grad_norm": 0.2100939143704434, "learning_rate": 2.0991767934143473e-05, "loss": 0.4655, "step": 1765 }, { "epoch": 1.8687830687830687, "grad_norm": 0.23434130765094305, "learning_rate": 2.097216777734222e-05, "loss": 0.5209, "step": 1766 }, { "epoch": 1.8698412698412699, "grad_norm": 0.2434629762626319, "learning_rate": 2.0952567620540965e-05, "loss": 0.4495, "step": 1767 }, { "epoch": 1.870899470899471, "grad_norm": 0.21781894326153597, "learning_rate": 2.093296746373971e-05, "loss": 0.4462, "step": 1768 }, { "epoch": 1.8719576719576718, "grad_norm": 0.20899498020213622, "learning_rate": 2.0913367306938457e-05, "loss": 0.4457, "step": 1769 }, { "epoch": 1.873015873015873, "grad_norm": 0.2167051331331649, "learning_rate": 2.0893767150137203e-05, "loss": 0.4375, "step": 1770 }, { "epoch": 1.8740740740740742, "grad_norm": 0.2240331588887899, "learning_rate": 2.087416699333595e-05, "loss": 0.5185, "step": 1771 }, { "epoch": 1.875132275132275, "grad_norm": 0.22090206678208632, "learning_rate": 2.085456683653469e-05, "loss": 0.4808, "step": 1772 }, { "epoch": 1.8761904761904762, "grad_norm": 0.22040696733634577, "learning_rate": 2.083496667973344e-05, "loss": 0.456, "step": 1773 }, { "epoch": 1.8772486772486774, "grad_norm": 0.2079955864538246, "learning_rate": 2.0815366522932183e-05, "loss": 0.4143, "step": 1774 }, { "epoch": 1.8783068783068781, "grad_norm": 0.22169752369395546, "learning_rate": 2.0795766366130932e-05, "loss": 0.4858, "step": 1775 }, { "epoch": 1.8793650793650793, "grad_norm": 0.2402070315956562, "learning_rate": 2.0776166209329674e-05, "loss": 0.4504, "step": 1776 }, { "epoch": 1.8804232804232806, "grad_norm": 0.23435927645511864, "learning_rate": 2.075656605252842e-05, "loss": 0.4593, "step": 1777 }, { "epoch": 1.8814814814814815, "grad_norm": 0.20818590974484366, "learning_rate": 2.0736965895727166e-05, "loss": 0.4381, "step": 1778 }, { "epoch": 1.8825396825396825, "grad_norm": 0.2130354907179979, "learning_rate": 2.0717365738925912e-05, "loss": 0.4297, "step": 1779 }, { "epoch": 1.8835978835978837, "grad_norm": 0.21660044821589397, "learning_rate": 2.0697765582124658e-05, "loss": 0.4697, "step": 1780 }, { "epoch": 1.8846560846560847, "grad_norm": 0.20309183557121804, "learning_rate": 2.0678165425323404e-05, "loss": 0.4025, "step": 1781 }, { "epoch": 1.8857142857142857, "grad_norm": 0.21087610186232195, "learning_rate": 2.065856526852215e-05, "loss": 0.4546, "step": 1782 }, { "epoch": 1.8867724867724869, "grad_norm": 0.22732457303926343, "learning_rate": 2.0638965111720896e-05, "loss": 0.4668, "step": 1783 }, { "epoch": 1.8878306878306879, "grad_norm": 0.22180452040864543, "learning_rate": 2.061936495491964e-05, "loss": 0.4584, "step": 1784 }, { "epoch": 1.8888888888888888, "grad_norm": 0.21261990698971484, "learning_rate": 2.0599764798118387e-05, "loss": 0.4406, "step": 1785 }, { "epoch": 1.88994708994709, "grad_norm": 0.21071003692457904, "learning_rate": 2.0580164641317133e-05, "loss": 0.4143, "step": 1786 }, { "epoch": 1.891005291005291, "grad_norm": 0.2204518775713819, "learning_rate": 2.0560564484515876e-05, "loss": 0.4135, "step": 1787 }, { "epoch": 1.892063492063492, "grad_norm": 0.2078948195319659, "learning_rate": 2.0540964327714625e-05, "loss": 0.3921, "step": 1788 }, { "epoch": 1.8931216931216932, "grad_norm": 0.2196279086951217, "learning_rate": 2.0521364170913367e-05, "loss": 0.3873, "step": 1789 }, { "epoch": 1.8941798941798942, "grad_norm": 0.24347249045633615, "learning_rate": 2.0501764014112113e-05, "loss": 0.4223, "step": 1790 }, { "epoch": 1.8952380952380952, "grad_norm": 0.21502367020868784, "learning_rate": 2.048216385731086e-05, "loss": 0.4241, "step": 1791 }, { "epoch": 1.8962962962962964, "grad_norm": 0.2223665065572327, "learning_rate": 2.0462563700509605e-05, "loss": 0.4585, "step": 1792 }, { "epoch": 1.8973544973544973, "grad_norm": 0.2210410067693124, "learning_rate": 2.044296354370835e-05, "loss": 0.4468, "step": 1793 }, { "epoch": 1.8984126984126983, "grad_norm": 0.21557122963124056, "learning_rate": 2.0423363386907097e-05, "loss": 0.458, "step": 1794 }, { "epoch": 1.8994708994708995, "grad_norm": 0.24519100116436493, "learning_rate": 2.040376323010584e-05, "loss": 0.4466, "step": 1795 }, { "epoch": 1.9005291005291005, "grad_norm": 0.24801432016192543, "learning_rate": 2.038416307330459e-05, "loss": 0.4839, "step": 1796 }, { "epoch": 1.9015873015873015, "grad_norm": 0.20049577359290413, "learning_rate": 2.036456291650333e-05, "loss": 0.4127, "step": 1797 }, { "epoch": 1.9026455026455027, "grad_norm": 0.23520752147502938, "learning_rate": 2.034496275970208e-05, "loss": 0.448, "step": 1798 }, { "epoch": 1.9037037037037037, "grad_norm": 0.24490985101151913, "learning_rate": 2.0325362602900823e-05, "loss": 0.431, "step": 1799 }, { "epoch": 1.9047619047619047, "grad_norm": 0.2664715652435904, "learning_rate": 2.030576244609957e-05, "loss": 0.4766, "step": 1800 }, { "epoch": 1.9058201058201059, "grad_norm": 0.2485082923716217, "learning_rate": 2.0286162289298315e-05, "loss": 0.573, "step": 1801 }, { "epoch": 1.9068783068783068, "grad_norm": 0.24251809787369136, "learning_rate": 2.026656213249706e-05, "loss": 0.4245, "step": 1802 }, { "epoch": 1.9079365079365078, "grad_norm": 0.27387491257447555, "learning_rate": 2.0246961975695806e-05, "loss": 0.4171, "step": 1803 }, { "epoch": 1.908994708994709, "grad_norm": 0.23047285152697722, "learning_rate": 2.0227361818894552e-05, "loss": 0.493, "step": 1804 }, { "epoch": 1.91005291005291, "grad_norm": 7.990646854444911, "learning_rate": 2.0207761662093298e-05, "loss": 0.635, "step": 1805 }, { "epoch": 1.911111111111111, "grad_norm": 0.26163599966519524, "learning_rate": 2.0188161505292044e-05, "loss": 0.4351, "step": 1806 }, { "epoch": 1.9121693121693122, "grad_norm": 0.25130856546308006, "learning_rate": 2.016856134849079e-05, "loss": 0.4399, "step": 1807 }, { "epoch": 1.9132275132275134, "grad_norm": 0.20476985393616431, "learning_rate": 2.0148961191689532e-05, "loss": 0.4158, "step": 1808 }, { "epoch": 1.9142857142857141, "grad_norm": 0.22396937696533817, "learning_rate": 2.012936103488828e-05, "loss": 0.4953, "step": 1809 }, { "epoch": 1.9153439153439153, "grad_norm": 0.2100685359464842, "learning_rate": 2.0109760878087024e-05, "loss": 0.4604, "step": 1810 }, { "epoch": 1.9164021164021166, "grad_norm": 0.2347068667100795, "learning_rate": 2.0090160721285773e-05, "loss": 0.4656, "step": 1811 }, { "epoch": 1.9174603174603173, "grad_norm": 0.2134340622258882, "learning_rate": 2.0070560564484516e-05, "loss": 0.4155, "step": 1812 }, { "epoch": 1.9185185185185185, "grad_norm": 0.20915306386821994, "learning_rate": 2.005096040768326e-05, "loss": 0.4005, "step": 1813 }, { "epoch": 1.9195767195767197, "grad_norm": 0.2261359347456008, "learning_rate": 2.0031360250882008e-05, "loss": 0.4248, "step": 1814 }, { "epoch": 1.9206349206349205, "grad_norm": 0.21813479888223342, "learning_rate": 2.0011760094080753e-05, "loss": 0.4638, "step": 1815 }, { "epoch": 1.9216931216931217, "grad_norm": 0.2101057981170796, "learning_rate": 1.99921599372795e-05, "loss": 0.4028, "step": 1816 }, { "epoch": 1.9227513227513229, "grad_norm": 0.22511459770933143, "learning_rate": 1.9972559780478245e-05, "loss": 0.5111, "step": 1817 }, { "epoch": 1.9238095238095239, "grad_norm": 0.23388911916732105, "learning_rate": 1.995295962367699e-05, "loss": 0.4683, "step": 1818 }, { "epoch": 1.9248677248677248, "grad_norm": 0.22435740200537527, "learning_rate": 1.9933359466875737e-05, "loss": 0.438, "step": 1819 }, { "epoch": 1.925925925925926, "grad_norm": 0.21611950819751236, "learning_rate": 1.9913759310074483e-05, "loss": 0.378, "step": 1820 }, { "epoch": 1.926984126984127, "grad_norm": 0.20291845845078743, "learning_rate": 1.9894159153273225e-05, "loss": 0.4277, "step": 1821 }, { "epoch": 1.928042328042328, "grad_norm": 0.22847172320277195, "learning_rate": 1.9874558996471975e-05, "loss": 0.4674, "step": 1822 }, { "epoch": 1.9291005291005292, "grad_norm": 0.23053558937849467, "learning_rate": 1.9854958839670717e-05, "loss": 0.4015, "step": 1823 }, { "epoch": 1.9301587301587302, "grad_norm": 0.23708550014377422, "learning_rate": 1.9835358682869466e-05, "loss": 0.4684, "step": 1824 }, { "epoch": 1.9312169312169312, "grad_norm": 0.316817125246121, "learning_rate": 1.981575852606821e-05, "loss": 0.4263, "step": 1825 }, { "epoch": 1.9322751322751324, "grad_norm": 0.2254829079471954, "learning_rate": 1.9796158369266955e-05, "loss": 0.4851, "step": 1826 }, { "epoch": 1.9333333333333333, "grad_norm": 0.2316587670641986, "learning_rate": 1.97765582124657e-05, "loss": 0.4229, "step": 1827 }, { "epoch": 1.9343915343915343, "grad_norm": 0.21297157984215134, "learning_rate": 1.9756958055664446e-05, "loss": 0.4415, "step": 1828 }, { "epoch": 1.9354497354497355, "grad_norm": 0.2612957037749709, "learning_rate": 1.9737357898863192e-05, "loss": 0.4589, "step": 1829 }, { "epoch": 1.9365079365079365, "grad_norm": 0.2129663181734509, "learning_rate": 1.9717757742061938e-05, "loss": 0.4511, "step": 1830 }, { "epoch": 1.9375661375661375, "grad_norm": 0.23503195721500833, "learning_rate": 1.969815758526068e-05, "loss": 0.4617, "step": 1831 }, { "epoch": 1.9386243386243387, "grad_norm": 0.2287134737206407, "learning_rate": 1.967855742845943e-05, "loss": 0.471, "step": 1832 }, { "epoch": 1.9396825396825397, "grad_norm": 0.21262604276669486, "learning_rate": 1.9658957271658172e-05, "loss": 0.47, "step": 1833 }, { "epoch": 1.9407407407407407, "grad_norm": 0.220344769052406, "learning_rate": 1.963935711485692e-05, "loss": 0.4505, "step": 1834 }, { "epoch": 1.9417989417989419, "grad_norm": 0.220152899947779, "learning_rate": 1.9619756958055664e-05, "loss": 0.4263, "step": 1835 }, { "epoch": 1.9428571428571428, "grad_norm": 0.20366436651004108, "learning_rate": 1.960015680125441e-05, "loss": 0.3771, "step": 1836 }, { "epoch": 1.9439153439153438, "grad_norm": 0.2200894429401924, "learning_rate": 1.9580556644453156e-05, "loss": 0.4203, "step": 1837 }, { "epoch": 1.944973544973545, "grad_norm": 0.21590610062905347, "learning_rate": 1.9560956487651902e-05, "loss": 0.4558, "step": 1838 }, { "epoch": 1.946031746031746, "grad_norm": 0.22356270505912842, "learning_rate": 1.9541356330850648e-05, "loss": 0.4233, "step": 1839 }, { "epoch": 1.947089947089947, "grad_norm": 0.20737807332328057, "learning_rate": 1.9521756174049394e-05, "loss": 0.3993, "step": 1840 }, { "epoch": 1.9481481481481482, "grad_norm": 0.21342097386972908, "learning_rate": 1.950215601724814e-05, "loss": 0.4127, "step": 1841 }, { "epoch": 1.9492063492063492, "grad_norm": 0.2031800844818702, "learning_rate": 1.9482555860446885e-05, "loss": 0.383, "step": 1842 }, { "epoch": 1.9502645502645501, "grad_norm": 0.22744285466424116, "learning_rate": 1.946295570364563e-05, "loss": 0.4748, "step": 1843 }, { "epoch": 1.9513227513227513, "grad_norm": 0.23141003857099457, "learning_rate": 1.9443355546844374e-05, "loss": 0.5051, "step": 1844 }, { "epoch": 1.9523809523809523, "grad_norm": 0.2145215316356645, "learning_rate": 1.9423755390043123e-05, "loss": 0.4225, "step": 1845 }, { "epoch": 1.9534391534391533, "grad_norm": 0.2026905601029007, "learning_rate": 1.9404155233241865e-05, "loss": 0.4007, "step": 1846 }, { "epoch": 1.9544973544973545, "grad_norm": 0.22999532724274202, "learning_rate": 1.9384555076440615e-05, "loss": 0.4617, "step": 1847 }, { "epoch": 1.9555555555555557, "grad_norm": 0.2082995676826002, "learning_rate": 1.9364954919639357e-05, "loss": 0.4278, "step": 1848 }, { "epoch": 1.9566137566137565, "grad_norm": 0.22475386311924242, "learning_rate": 1.9345354762838103e-05, "loss": 0.4241, "step": 1849 }, { "epoch": 1.9576719576719577, "grad_norm": 0.22479359963728507, "learning_rate": 1.932575460603685e-05, "loss": 0.4338, "step": 1850 }, { "epoch": 1.9587301587301589, "grad_norm": 0.20367965304182695, "learning_rate": 1.9306154449235595e-05, "loss": 0.3818, "step": 1851 }, { "epoch": 1.9597883597883596, "grad_norm": 0.21527243201486918, "learning_rate": 1.928655429243434e-05, "loss": 0.3774, "step": 1852 }, { "epoch": 1.9608465608465608, "grad_norm": 0.2249500078225003, "learning_rate": 1.9266954135633087e-05, "loss": 0.4967, "step": 1853 }, { "epoch": 1.961904761904762, "grad_norm": 0.21710410127581037, "learning_rate": 1.9247353978831832e-05, "loss": 0.456, "step": 1854 }, { "epoch": 1.9629629629629628, "grad_norm": 0.21285160925599858, "learning_rate": 1.9227753822030578e-05, "loss": 0.4345, "step": 1855 }, { "epoch": 1.964021164021164, "grad_norm": 0.19869936479793823, "learning_rate": 1.9208153665229324e-05, "loss": 0.4267, "step": 1856 }, { "epoch": 1.9650793650793652, "grad_norm": 0.20189396892728878, "learning_rate": 1.9188553508428067e-05, "loss": 0.4246, "step": 1857 }, { "epoch": 1.9661375661375662, "grad_norm": 0.21026152529101638, "learning_rate": 1.9168953351626816e-05, "loss": 0.4074, "step": 1858 }, { "epoch": 1.9671957671957672, "grad_norm": 0.2143954764423591, "learning_rate": 1.914935319482556e-05, "loss": 0.4739, "step": 1859 }, { "epoch": 1.9682539682539684, "grad_norm": 0.19350444830703373, "learning_rate": 1.9129753038024308e-05, "loss": 0.3873, "step": 1860 }, { "epoch": 1.9693121693121693, "grad_norm": 0.19855752381553382, "learning_rate": 1.911015288122305e-05, "loss": 0.4127, "step": 1861 }, { "epoch": 1.9703703703703703, "grad_norm": 0.21567120927747255, "learning_rate": 1.9090552724421796e-05, "loss": 0.4648, "step": 1862 }, { "epoch": 1.9714285714285715, "grad_norm": 0.21153824863129872, "learning_rate": 1.9070952567620542e-05, "loss": 0.4796, "step": 1863 }, { "epoch": 1.9724867724867725, "grad_norm": 0.19320130815422823, "learning_rate": 1.9051352410819288e-05, "loss": 0.4275, "step": 1864 }, { "epoch": 1.9735449735449735, "grad_norm": 0.19887236706758227, "learning_rate": 1.9031752254018034e-05, "loss": 0.4148, "step": 1865 }, { "epoch": 1.9746031746031747, "grad_norm": 0.1932406080084438, "learning_rate": 1.901215209721678e-05, "loss": 0.4067, "step": 1866 }, { "epoch": 1.9756613756613757, "grad_norm": 0.21807163033247312, "learning_rate": 1.8992551940415522e-05, "loss": 0.4882, "step": 1867 }, { "epoch": 1.9767195767195767, "grad_norm": 0.20618559739939532, "learning_rate": 1.897295178361427e-05, "loss": 0.4276, "step": 1868 }, { "epoch": 1.9777777777777779, "grad_norm": 0.19090448828455922, "learning_rate": 1.8953351626813014e-05, "loss": 0.4043, "step": 1869 }, { "epoch": 1.9788359788359788, "grad_norm": 0.1947353451525142, "learning_rate": 1.893375147001176e-05, "loss": 0.4039, "step": 1870 }, { "epoch": 1.9798941798941798, "grad_norm": 0.22563886214800088, "learning_rate": 1.8914151313210505e-05, "loss": 0.4928, "step": 1871 }, { "epoch": 1.980952380952381, "grad_norm": 0.24654476642054673, "learning_rate": 1.889455115640925e-05, "loss": 0.5016, "step": 1872 }, { "epoch": 1.982010582010582, "grad_norm": 0.21770134373827096, "learning_rate": 1.8874950999607997e-05, "loss": 0.4636, "step": 1873 }, { "epoch": 1.983068783068783, "grad_norm": 0.21078921223109962, "learning_rate": 1.8855350842806743e-05, "loss": 0.4529, "step": 1874 }, { "epoch": 1.9841269841269842, "grad_norm": 0.2042699274002301, "learning_rate": 1.883575068600549e-05, "loss": 0.4443, "step": 1875 }, { "epoch": 1.9851851851851852, "grad_norm": 0.20988400394066842, "learning_rate": 1.8816150529204235e-05, "loss": 0.4441, "step": 1876 }, { "epoch": 1.9862433862433861, "grad_norm": 0.231811845633082, "learning_rate": 1.879655037240298e-05, "loss": 0.4538, "step": 1877 }, { "epoch": 1.9873015873015873, "grad_norm": 0.19558466017276782, "learning_rate": 1.8776950215601727e-05, "loss": 0.4031, "step": 1878 }, { "epoch": 1.9883597883597883, "grad_norm": 0.20830804273827055, "learning_rate": 1.8757350058800472e-05, "loss": 0.454, "step": 1879 }, { "epoch": 1.9894179894179893, "grad_norm": 0.2316902360278231, "learning_rate": 1.8737749901999215e-05, "loss": 0.4099, "step": 1880 }, { "epoch": 1.9904761904761905, "grad_norm": 0.6294053779165485, "learning_rate": 1.8718149745197964e-05, "loss": 0.5239, "step": 1881 }, { "epoch": 1.9915343915343915, "grad_norm": 0.1914593214711506, "learning_rate": 1.8698549588396707e-05, "loss": 0.3855, "step": 1882 }, { "epoch": 1.9925925925925925, "grad_norm": 0.21642178461292894, "learning_rate": 1.8678949431595456e-05, "loss": 0.467, "step": 1883 }, { "epoch": 1.9936507936507937, "grad_norm": 0.23743786345884751, "learning_rate": 1.86593492747942e-05, "loss": 0.4487, "step": 1884 }, { "epoch": 1.9947089947089947, "grad_norm": 0.23198695415553083, "learning_rate": 1.8639749117992944e-05, "loss": 0.5057, "step": 1885 }, { "epoch": 1.9957671957671956, "grad_norm": 0.3471270912766888, "learning_rate": 1.862014896119169e-05, "loss": 0.3937, "step": 1886 }, { "epoch": 1.9968253968253968, "grad_norm": 0.22576252798533672, "learning_rate": 1.8600548804390436e-05, "loss": 0.4228, "step": 1887 }, { "epoch": 1.997883597883598, "grad_norm": 0.21560388020282445, "learning_rate": 1.8580948647589182e-05, "loss": 0.4147, "step": 1888 }, { "epoch": 1.9989417989417988, "grad_norm": 0.22358325111497787, "learning_rate": 1.8561348490787928e-05, "loss": 0.4384, "step": 1889 }, { "epoch": 2.0, "grad_norm": 0.22079447072147176, "learning_rate": 1.8541748333986674e-05, "loss": 0.4229, "step": 1890 }, { "epoch": 2.001058201058201, "grad_norm": 0.298496877542824, "learning_rate": 1.852214817718542e-05, "loss": 0.3341, "step": 1891 }, { "epoch": 2.002116402116402, "grad_norm": 0.2769200047722302, "learning_rate": 1.8502548020384165e-05, "loss": 0.3833, "step": 1892 }, { "epoch": 2.003174603174603, "grad_norm": 0.29171539792667067, "learning_rate": 1.8482947863582908e-05, "loss": 0.3858, "step": 1893 }, { "epoch": 2.0042328042328044, "grad_norm": 0.2789134735368325, "learning_rate": 1.8463347706781657e-05, "loss": 0.3568, "step": 1894 }, { "epoch": 2.005291005291005, "grad_norm": 0.34633898150173376, "learning_rate": 1.84437475499804e-05, "loss": 0.3465, "step": 1895 }, { "epoch": 2.0063492063492063, "grad_norm": 0.2793278281768595, "learning_rate": 1.842414739317915e-05, "loss": 0.4017, "step": 1896 }, { "epoch": 2.0074074074074075, "grad_norm": 0.3059555073494133, "learning_rate": 1.840454723637789e-05, "loss": 0.3392, "step": 1897 }, { "epoch": 2.0084656084656083, "grad_norm": 0.26671660329309566, "learning_rate": 1.8384947079576637e-05, "loss": 0.3768, "step": 1898 }, { "epoch": 2.0095238095238095, "grad_norm": 0.276118400635007, "learning_rate": 1.8365346922775383e-05, "loss": 0.3967, "step": 1899 }, { "epoch": 2.0105820105820107, "grad_norm": 0.23862871261186194, "learning_rate": 1.834574676597413e-05, "loss": 0.3411, "step": 1900 }, { "epoch": 2.0116402116402115, "grad_norm": 0.2602752683930678, "learning_rate": 1.8326146609172875e-05, "loss": 0.3598, "step": 1901 }, { "epoch": 2.0126984126984127, "grad_norm": 0.24165262446248684, "learning_rate": 1.830654645237162e-05, "loss": 0.3637, "step": 1902 }, { "epoch": 2.013756613756614, "grad_norm": 0.2211835840328273, "learning_rate": 1.8286946295570363e-05, "loss": 0.3475, "step": 1903 }, { "epoch": 2.0148148148148146, "grad_norm": 0.2536735392338854, "learning_rate": 1.8267346138769113e-05, "loss": 0.371, "step": 1904 }, { "epoch": 2.015873015873016, "grad_norm": 0.23225659690495537, "learning_rate": 1.8247745981967855e-05, "loss": 0.3231, "step": 1905 }, { "epoch": 2.016931216931217, "grad_norm": 0.2363270132100584, "learning_rate": 1.82281458251666e-05, "loss": 0.4065, "step": 1906 }, { "epoch": 2.0179894179894178, "grad_norm": 0.21698478882185174, "learning_rate": 1.8208545668365347e-05, "loss": 0.372, "step": 1907 }, { "epoch": 2.019047619047619, "grad_norm": 0.23211712957073785, "learning_rate": 1.8188945511564093e-05, "loss": 0.3939, "step": 1908 }, { "epoch": 2.02010582010582, "grad_norm": 0.21859722986589825, "learning_rate": 1.816934535476284e-05, "loss": 0.3096, "step": 1909 }, { "epoch": 2.0211640211640214, "grad_norm": 0.22198129532334243, "learning_rate": 1.8149745197961584e-05, "loss": 0.3603, "step": 1910 }, { "epoch": 2.022222222222222, "grad_norm": 0.20161388915274267, "learning_rate": 1.813014504116033e-05, "loss": 0.3316, "step": 1911 }, { "epoch": 2.0232804232804233, "grad_norm": 0.2169686284719689, "learning_rate": 1.8110544884359076e-05, "loss": 0.3631, "step": 1912 }, { "epoch": 2.0243386243386245, "grad_norm": 0.20992722910943554, "learning_rate": 1.8090944727557822e-05, "loss": 0.354, "step": 1913 }, { "epoch": 2.0253968253968253, "grad_norm": 0.21467603286816797, "learning_rate": 1.8071344570756568e-05, "loss": 0.3362, "step": 1914 }, { "epoch": 2.0264550264550265, "grad_norm": 0.20786206596359347, "learning_rate": 1.8051744413955314e-05, "loss": 0.3244, "step": 1915 }, { "epoch": 2.0275132275132277, "grad_norm": 0.21372256780280968, "learning_rate": 1.8032144257154056e-05, "loss": 0.371, "step": 1916 }, { "epoch": 2.0285714285714285, "grad_norm": 0.21738963933410124, "learning_rate": 1.8012544100352806e-05, "loss": 0.3022, "step": 1917 }, { "epoch": 2.0296296296296297, "grad_norm": 0.20703116847143097, "learning_rate": 1.7992943943551548e-05, "loss": 0.3333, "step": 1918 }, { "epoch": 2.030687830687831, "grad_norm": 0.22313107632965132, "learning_rate": 1.7973343786750294e-05, "loss": 0.3615, "step": 1919 }, { "epoch": 2.0317460317460316, "grad_norm": 0.20832627765877879, "learning_rate": 1.795374362994904e-05, "loss": 0.3038, "step": 1920 }, { "epoch": 2.032804232804233, "grad_norm": 0.21323384648245985, "learning_rate": 1.7934143473147786e-05, "loss": 0.3509, "step": 1921 }, { "epoch": 2.033862433862434, "grad_norm": 0.22323179664908244, "learning_rate": 1.791454331634653e-05, "loss": 0.3586, "step": 1922 }, { "epoch": 2.034920634920635, "grad_norm": 0.2223456687807534, "learning_rate": 1.7894943159545277e-05, "loss": 0.3692, "step": 1923 }, { "epoch": 2.035978835978836, "grad_norm": 0.20164634068506698, "learning_rate": 1.7875343002744023e-05, "loss": 0.3694, "step": 1924 }, { "epoch": 2.037037037037037, "grad_norm": 0.20323586163027332, "learning_rate": 1.785574284594277e-05, "loss": 0.3405, "step": 1925 }, { "epoch": 2.038095238095238, "grad_norm": 0.19413426270312215, "learning_rate": 1.7836142689141515e-05, "loss": 0.3272, "step": 1926 }, { "epoch": 2.039153439153439, "grad_norm": 0.21682781764578435, "learning_rate": 1.781654253234026e-05, "loss": 0.3543, "step": 1927 }, { "epoch": 2.0402116402116404, "grad_norm": 0.20226324872136348, "learning_rate": 1.7796942375539007e-05, "loss": 0.3249, "step": 1928 }, { "epoch": 2.041269841269841, "grad_norm": 0.22193251820682366, "learning_rate": 1.777734221873775e-05, "loss": 0.3416, "step": 1929 }, { "epoch": 2.0423280423280423, "grad_norm": 0.1835440881260506, "learning_rate": 1.77577420619365e-05, "loss": 0.3253, "step": 1930 }, { "epoch": 2.0433862433862435, "grad_norm": 0.23125563586478098, "learning_rate": 1.773814190513524e-05, "loss": 0.3172, "step": 1931 }, { "epoch": 2.0444444444444443, "grad_norm": 0.2192023103480394, "learning_rate": 1.771854174833399e-05, "loss": 0.3675, "step": 1932 }, { "epoch": 2.0455026455026455, "grad_norm": 0.19764009006450245, "learning_rate": 1.7698941591532733e-05, "loss": 0.3108, "step": 1933 }, { "epoch": 2.0465608465608467, "grad_norm": 0.21277420085835824, "learning_rate": 1.767934143473148e-05, "loss": 0.3796, "step": 1934 }, { "epoch": 2.0476190476190474, "grad_norm": 0.2221209417138681, "learning_rate": 1.7659741277930225e-05, "loss": 0.3444, "step": 1935 }, { "epoch": 2.0486772486772487, "grad_norm": 0.208598589107057, "learning_rate": 1.764014112112897e-05, "loss": 0.3416, "step": 1936 }, { "epoch": 2.04973544973545, "grad_norm": 0.19795154801112497, "learning_rate": 1.7620540964327713e-05, "loss": 0.295, "step": 1937 }, { "epoch": 2.0507936507936506, "grad_norm": 0.21861746828164197, "learning_rate": 1.7600940807526462e-05, "loss": 0.3672, "step": 1938 }, { "epoch": 2.051851851851852, "grad_norm": 0.20848142512577938, "learning_rate": 1.7581340650725205e-05, "loss": 0.3503, "step": 1939 }, { "epoch": 2.052910052910053, "grad_norm": 0.20406572548942745, "learning_rate": 1.7561740493923954e-05, "loss": 0.3594, "step": 1940 }, { "epoch": 2.0539682539682538, "grad_norm": 0.22043618465867792, "learning_rate": 1.7542140337122696e-05, "loss": 0.3388, "step": 1941 }, { "epoch": 2.055026455026455, "grad_norm": 0.20857631557038028, "learning_rate": 1.7522540180321442e-05, "loss": 0.3665, "step": 1942 }, { "epoch": 2.056084656084656, "grad_norm": 0.21727483413067578, "learning_rate": 1.7502940023520188e-05, "loss": 0.4127, "step": 1943 }, { "epoch": 2.057142857142857, "grad_norm": 0.23262861070941543, "learning_rate": 1.7483339866718934e-05, "loss": 0.408, "step": 1944 }, { "epoch": 2.058201058201058, "grad_norm": 0.19879438803020888, "learning_rate": 1.746373970991768e-05, "loss": 0.3254, "step": 1945 }, { "epoch": 2.0592592592592593, "grad_norm": 0.21005560252157807, "learning_rate": 1.7444139553116426e-05, "loss": 0.3522, "step": 1946 }, { "epoch": 2.06031746031746, "grad_norm": 0.22966010387646338, "learning_rate": 1.742453939631517e-05, "loss": 0.3749, "step": 1947 }, { "epoch": 2.0613756613756613, "grad_norm": 0.19379588380401883, "learning_rate": 1.7404939239513917e-05, "loss": 0.3103, "step": 1948 }, { "epoch": 2.0624338624338625, "grad_norm": 0.22978352813918101, "learning_rate": 1.7385339082712663e-05, "loss": 0.3673, "step": 1949 }, { "epoch": 2.0634920634920633, "grad_norm": 0.2212514519753634, "learning_rate": 1.736573892591141e-05, "loss": 0.3597, "step": 1950 }, { "epoch": 2.0645502645502645, "grad_norm": 0.20997192399458178, "learning_rate": 1.7346138769110155e-05, "loss": 0.3736, "step": 1951 }, { "epoch": 2.0656084656084657, "grad_norm": 0.20665636238190924, "learning_rate": 1.7326538612308898e-05, "loss": 0.3523, "step": 1952 }, { "epoch": 2.066666666666667, "grad_norm": 0.20758995846695377, "learning_rate": 1.7306938455507647e-05, "loss": 0.3748, "step": 1953 }, { "epoch": 2.0677248677248676, "grad_norm": 0.22564678363361657, "learning_rate": 1.728733829870639e-05, "loss": 0.336, "step": 1954 }, { "epoch": 2.068783068783069, "grad_norm": 0.2516870186148532, "learning_rate": 1.7267738141905135e-05, "loss": 0.4016, "step": 1955 }, { "epoch": 2.06984126984127, "grad_norm": 0.20252710114468478, "learning_rate": 1.724813798510388e-05, "loss": 0.3567, "step": 1956 }, { "epoch": 2.070899470899471, "grad_norm": 0.2282832684870184, "learning_rate": 1.7228537828302627e-05, "loss": 0.3136, "step": 1957 }, { "epoch": 2.071957671957672, "grad_norm": 0.2797927938402852, "learning_rate": 1.7208937671501373e-05, "loss": 0.3932, "step": 1958 }, { "epoch": 2.073015873015873, "grad_norm": 0.20461018617775642, "learning_rate": 1.718933751470012e-05, "loss": 0.3397, "step": 1959 }, { "epoch": 2.074074074074074, "grad_norm": 0.21544569146507456, "learning_rate": 1.7169737357898865e-05, "loss": 0.3519, "step": 1960 }, { "epoch": 2.075132275132275, "grad_norm": 0.22183783680712937, "learning_rate": 1.715013720109761e-05, "loss": 0.3515, "step": 1961 }, { "epoch": 2.0761904761904764, "grad_norm": 0.22146564807753902, "learning_rate": 1.7130537044296356e-05, "loss": 0.3354, "step": 1962 }, { "epoch": 2.077248677248677, "grad_norm": 0.19857023572678106, "learning_rate": 1.7110936887495102e-05, "loss": 0.3213, "step": 1963 }, { "epoch": 2.0783068783068783, "grad_norm": 0.217266771807875, "learning_rate": 1.7091336730693848e-05, "loss": 0.3532, "step": 1964 }, { "epoch": 2.0793650793650795, "grad_norm": 0.20005495691125305, "learning_rate": 1.707173657389259e-05, "loss": 0.3409, "step": 1965 }, { "epoch": 2.0804232804232803, "grad_norm": 0.21330150110247303, "learning_rate": 1.705213641709134e-05, "loss": 0.3741, "step": 1966 }, { "epoch": 2.0814814814814815, "grad_norm": 0.2278780973049878, "learning_rate": 1.7032536260290082e-05, "loss": 0.3468, "step": 1967 }, { "epoch": 2.0825396825396827, "grad_norm": 0.2109353272677484, "learning_rate": 1.7012936103488828e-05, "loss": 0.3471, "step": 1968 }, { "epoch": 2.0835978835978834, "grad_norm": 0.19895351419178237, "learning_rate": 1.6993335946687574e-05, "loss": 0.3492, "step": 1969 }, { "epoch": 2.0846560846560847, "grad_norm": 0.207632263472837, "learning_rate": 1.697373578988632e-05, "loss": 0.3399, "step": 1970 }, { "epoch": 2.085714285714286, "grad_norm": 0.2163337881929324, "learning_rate": 1.6954135633085066e-05, "loss": 0.3255, "step": 1971 }, { "epoch": 2.0867724867724866, "grad_norm": 0.23040692314017927, "learning_rate": 1.6934535476283812e-05, "loss": 0.3606, "step": 1972 }, { "epoch": 2.087830687830688, "grad_norm": 0.21989016356512248, "learning_rate": 1.6914935319482554e-05, "loss": 0.3647, "step": 1973 }, { "epoch": 2.088888888888889, "grad_norm": 0.19613496455842805, "learning_rate": 1.6895335162681303e-05, "loss": 0.3203, "step": 1974 }, { "epoch": 2.0899470899470898, "grad_norm": 0.22567442720889747, "learning_rate": 1.6875735005880046e-05, "loss": 0.3365, "step": 1975 }, { "epoch": 2.091005291005291, "grad_norm": 0.22947949949294724, "learning_rate": 1.6856134849078795e-05, "loss": 0.3584, "step": 1976 }, { "epoch": 2.092063492063492, "grad_norm": 0.23041866230411318, "learning_rate": 1.6836534692277538e-05, "loss": 0.3731, "step": 1977 }, { "epoch": 2.093121693121693, "grad_norm": 0.21985790184342138, "learning_rate": 1.6816934535476284e-05, "loss": 0.3709, "step": 1978 }, { "epoch": 2.094179894179894, "grad_norm": 0.19774950421675602, "learning_rate": 1.679733437867503e-05, "loss": 0.3409, "step": 1979 }, { "epoch": 2.0952380952380953, "grad_norm": 0.2269563084196834, "learning_rate": 1.6777734221873775e-05, "loss": 0.3242, "step": 1980 }, { "epoch": 2.096296296296296, "grad_norm": 0.23498422047692288, "learning_rate": 1.675813406507252e-05, "loss": 0.3583, "step": 1981 }, { "epoch": 2.0973544973544973, "grad_norm": 0.20236703887981325, "learning_rate": 1.6738533908271267e-05, "loss": 0.3557, "step": 1982 }, { "epoch": 2.0984126984126985, "grad_norm": 0.20883374092288098, "learning_rate": 1.6718933751470013e-05, "loss": 0.3285, "step": 1983 }, { "epoch": 2.0994708994708993, "grad_norm": 0.2366397155580664, "learning_rate": 1.669933359466876e-05, "loss": 0.3539, "step": 1984 }, { "epoch": 2.1005291005291005, "grad_norm": 0.2178711255251422, "learning_rate": 1.6679733437867505e-05, "loss": 0.329, "step": 1985 }, { "epoch": 2.1015873015873017, "grad_norm": 0.21131026834720637, "learning_rate": 1.6660133281066247e-05, "loss": 0.3361, "step": 1986 }, { "epoch": 2.102645502645503, "grad_norm": 0.2055578261425351, "learning_rate": 1.6640533124264996e-05, "loss": 0.3032, "step": 1987 }, { "epoch": 2.1037037037037036, "grad_norm": 0.21812782402014477, "learning_rate": 1.662093296746374e-05, "loss": 0.296, "step": 1988 }, { "epoch": 2.104761904761905, "grad_norm": 0.21287530507237762, "learning_rate": 1.6601332810662488e-05, "loss": 0.351, "step": 1989 }, { "epoch": 2.105820105820106, "grad_norm": 0.20950973871134726, "learning_rate": 1.658173265386123e-05, "loss": 0.3575, "step": 1990 }, { "epoch": 2.106878306878307, "grad_norm": 0.21396955753725935, "learning_rate": 1.6562132497059977e-05, "loss": 0.3424, "step": 1991 }, { "epoch": 2.107936507936508, "grad_norm": 0.22616878855251082, "learning_rate": 1.6542532340258722e-05, "loss": 0.3662, "step": 1992 }, { "epoch": 2.108994708994709, "grad_norm": 0.20008820458876664, "learning_rate": 1.6522932183457468e-05, "loss": 0.3108, "step": 1993 }, { "epoch": 2.11005291005291, "grad_norm": 4.143863598055369, "learning_rate": 1.6503332026656214e-05, "loss": 0.4833, "step": 1994 }, { "epoch": 2.111111111111111, "grad_norm": 0.2292235575997881, "learning_rate": 1.648373186985496e-05, "loss": 0.3343, "step": 1995 }, { "epoch": 2.1121693121693124, "grad_norm": 0.23419107839319792, "learning_rate": 1.6464131713053706e-05, "loss": 0.3347, "step": 1996 }, { "epoch": 2.113227513227513, "grad_norm": 0.22940581819547085, "learning_rate": 1.6444531556252452e-05, "loss": 0.3951, "step": 1997 }, { "epoch": 2.1142857142857143, "grad_norm": 0.23526414274011145, "learning_rate": 1.6424931399451198e-05, "loss": 0.3424, "step": 1998 }, { "epoch": 2.1153439153439155, "grad_norm": 0.22735375376942824, "learning_rate": 1.6405331242649944e-05, "loss": 0.3649, "step": 1999 }, { "epoch": 2.1164021164021163, "grad_norm": 0.21944975202575878, "learning_rate": 1.638573108584869e-05, "loss": 0.3376, "step": 2000 }, { "epoch": 2.1174603174603175, "grad_norm": 0.5029143453384859, "learning_rate": 1.6366130929047432e-05, "loss": 0.3368, "step": 2001 }, { "epoch": 2.1185185185185187, "grad_norm": 0.2504647540949935, "learning_rate": 1.634653077224618e-05, "loss": 0.365, "step": 2002 }, { "epoch": 2.1195767195767194, "grad_norm": 0.2468901550955016, "learning_rate": 1.6326930615444924e-05, "loss": 0.3448, "step": 2003 }, { "epoch": 2.1206349206349207, "grad_norm": 0.2197505442465071, "learning_rate": 1.630733045864367e-05, "loss": 0.361, "step": 2004 }, { "epoch": 2.121693121693122, "grad_norm": 0.25391417814496214, "learning_rate": 1.6287730301842415e-05, "loss": 0.3581, "step": 2005 }, { "epoch": 2.1227513227513226, "grad_norm": 0.25883936021880843, "learning_rate": 1.626813014504116e-05, "loss": 0.3505, "step": 2006 }, { "epoch": 2.123809523809524, "grad_norm": 0.2520258145131729, "learning_rate": 1.6248529988239907e-05, "loss": 0.3398, "step": 2007 }, { "epoch": 2.124867724867725, "grad_norm": 0.2203661316211868, "learning_rate": 1.6228929831438653e-05, "loss": 0.3627, "step": 2008 }, { "epoch": 2.1259259259259258, "grad_norm": 0.23435328040308137, "learning_rate": 1.6209329674637396e-05, "loss": 0.3389, "step": 2009 }, { "epoch": 2.126984126984127, "grad_norm": 0.27743951042442594, "learning_rate": 1.6189729517836145e-05, "loss": 0.3761, "step": 2010 }, { "epoch": 2.128042328042328, "grad_norm": 0.20244205174436367, "learning_rate": 1.6170129361034887e-05, "loss": 0.3152, "step": 2011 }, { "epoch": 2.129100529100529, "grad_norm": 0.20894323608250834, "learning_rate": 1.6150529204233637e-05, "loss": 0.3854, "step": 2012 }, { "epoch": 2.13015873015873, "grad_norm": 0.24428589140845183, "learning_rate": 1.613092904743238e-05, "loss": 0.3876, "step": 2013 }, { "epoch": 2.1312169312169313, "grad_norm": 0.24322774885752654, "learning_rate": 1.6111328890631125e-05, "loss": 0.3581, "step": 2014 }, { "epoch": 2.132275132275132, "grad_norm": 0.20361725365232156, "learning_rate": 1.609172873382987e-05, "loss": 0.288, "step": 2015 }, { "epoch": 2.1333333333333333, "grad_norm": 0.23886166021463448, "learning_rate": 1.6072128577028617e-05, "loss": 0.3867, "step": 2016 }, { "epoch": 2.1343915343915345, "grad_norm": 0.20682128386588214, "learning_rate": 1.6052528420227363e-05, "loss": 0.3289, "step": 2017 }, { "epoch": 2.1354497354497353, "grad_norm": 0.22555396551236676, "learning_rate": 1.603292826342611e-05, "loss": 0.336, "step": 2018 }, { "epoch": 2.1365079365079365, "grad_norm": 0.22141671607674948, "learning_rate": 1.6013328106624854e-05, "loss": 0.3584, "step": 2019 }, { "epoch": 2.1375661375661377, "grad_norm": 0.21896453846877967, "learning_rate": 1.59937279498236e-05, "loss": 0.3527, "step": 2020 }, { "epoch": 2.1386243386243384, "grad_norm": 0.22228976521690386, "learning_rate": 1.5974127793022346e-05, "loss": 0.3923, "step": 2021 }, { "epoch": 2.1396825396825396, "grad_norm": 0.22066786689884066, "learning_rate": 1.595452763622109e-05, "loss": 0.3625, "step": 2022 }, { "epoch": 2.140740740740741, "grad_norm": 0.22383398867863769, "learning_rate": 1.5934927479419838e-05, "loss": 0.3741, "step": 2023 }, { "epoch": 2.1417989417989416, "grad_norm": 0.21926212085215122, "learning_rate": 1.591532732261858e-05, "loss": 0.305, "step": 2024 }, { "epoch": 2.142857142857143, "grad_norm": 0.21328408231111298, "learning_rate": 1.589572716581733e-05, "loss": 0.3539, "step": 2025 }, { "epoch": 2.143915343915344, "grad_norm": 0.20310210789488775, "learning_rate": 1.5876127009016072e-05, "loss": 0.3406, "step": 2026 }, { "epoch": 2.1449735449735448, "grad_norm": 0.20985690382441458, "learning_rate": 1.5856526852214818e-05, "loss": 0.3298, "step": 2027 }, { "epoch": 2.146031746031746, "grad_norm": 0.21426037653049163, "learning_rate": 1.5836926695413564e-05, "loss": 0.3076, "step": 2028 }, { "epoch": 2.147089947089947, "grad_norm": 0.21677917505372724, "learning_rate": 1.581732653861231e-05, "loss": 0.3824, "step": 2029 }, { "epoch": 2.148148148148148, "grad_norm": 0.2064869233207918, "learning_rate": 1.5797726381811055e-05, "loss": 0.3436, "step": 2030 }, { "epoch": 2.149206349206349, "grad_norm": 0.21971899343914275, "learning_rate": 1.57781262250098e-05, "loss": 0.369, "step": 2031 }, { "epoch": 2.1502645502645503, "grad_norm": 0.3221845152874034, "learning_rate": 1.5758526068208547e-05, "loss": 0.3425, "step": 2032 }, { "epoch": 2.1513227513227515, "grad_norm": 0.19874490771824377, "learning_rate": 1.5738925911407293e-05, "loss": 0.3184, "step": 2033 }, { "epoch": 2.1523809523809523, "grad_norm": 10.905047393025432, "learning_rate": 1.571932575460604e-05, "loss": 1.0908, "step": 2034 }, { "epoch": 2.1534391534391535, "grad_norm": 0.21735676542498997, "learning_rate": 1.569972559780478e-05, "loss": 0.3473, "step": 2035 }, { "epoch": 2.1544973544973547, "grad_norm": 0.216545692844694, "learning_rate": 1.568012544100353e-05, "loss": 0.3582, "step": 2036 }, { "epoch": 2.1555555555555554, "grad_norm": 0.21730583612765947, "learning_rate": 1.5660525284202273e-05, "loss": 0.3126, "step": 2037 }, { "epoch": 2.1566137566137566, "grad_norm": 0.2004424058198479, "learning_rate": 1.5640925127401022e-05, "loss": 0.34, "step": 2038 }, { "epoch": 2.157671957671958, "grad_norm": 0.2118306305932416, "learning_rate": 1.5621324970599765e-05, "loss": 0.307, "step": 2039 }, { "epoch": 2.1587301587301586, "grad_norm": 0.22105484159810915, "learning_rate": 1.560172481379851e-05, "loss": 0.3747, "step": 2040 }, { "epoch": 2.15978835978836, "grad_norm": 0.20960788510999773, "learning_rate": 1.5582124656997257e-05, "loss": 0.3417, "step": 2041 }, { "epoch": 2.160846560846561, "grad_norm": 0.21884953306007163, "learning_rate": 1.5562524500196003e-05, "loss": 0.3512, "step": 2042 }, { "epoch": 2.1619047619047618, "grad_norm": 0.23243491580846684, "learning_rate": 1.554292434339475e-05, "loss": 0.3841, "step": 2043 }, { "epoch": 2.162962962962963, "grad_norm": 0.23140426219629298, "learning_rate": 1.5523324186593494e-05, "loss": 0.405, "step": 2044 }, { "epoch": 2.164021164021164, "grad_norm": 0.20298786257942408, "learning_rate": 1.5503724029792237e-05, "loss": 0.3551, "step": 2045 }, { "epoch": 2.165079365079365, "grad_norm": 0.21348196250544565, "learning_rate": 1.5484123872990986e-05, "loss": 0.3378, "step": 2046 }, { "epoch": 2.166137566137566, "grad_norm": 0.2439977018458661, "learning_rate": 1.546452371618973e-05, "loss": 0.3773, "step": 2047 }, { "epoch": 2.1671957671957673, "grad_norm": 0.19895306399691473, "learning_rate": 1.5444923559388474e-05, "loss": 0.3209, "step": 2048 }, { "epoch": 2.168253968253968, "grad_norm": 0.21695598379114167, "learning_rate": 1.542532340258722e-05, "loss": 0.3675, "step": 2049 }, { "epoch": 2.1693121693121693, "grad_norm": 0.1794392725994556, "learning_rate": 1.5405723245785966e-05, "loss": 0.2931, "step": 2050 }, { "epoch": 2.1703703703703705, "grad_norm": 0.2038700225689888, "learning_rate": 1.5386123088984712e-05, "loss": 0.3375, "step": 2051 }, { "epoch": 2.1714285714285713, "grad_norm": 0.22597709606412955, "learning_rate": 1.5366522932183458e-05, "loss": 0.3576, "step": 2052 }, { "epoch": 2.1724867724867725, "grad_norm": 0.2077246480006038, "learning_rate": 1.5346922775382204e-05, "loss": 0.3344, "step": 2053 }, { "epoch": 2.1735449735449737, "grad_norm": 0.20000941756704047, "learning_rate": 1.532732261858095e-05, "loss": 0.3382, "step": 2054 }, { "epoch": 2.1746031746031744, "grad_norm": 0.19713016775489853, "learning_rate": 1.5307722461779696e-05, "loss": 0.3333, "step": 2055 }, { "epoch": 2.1756613756613756, "grad_norm": 0.19246485509834052, "learning_rate": 1.528812230497844e-05, "loss": 0.305, "step": 2056 }, { "epoch": 2.176719576719577, "grad_norm": 0.20627954988108957, "learning_rate": 1.5268522148177187e-05, "loss": 0.3102, "step": 2057 }, { "epoch": 2.1777777777777776, "grad_norm": 0.21828606128129674, "learning_rate": 1.5248921991375931e-05, "loss": 0.38, "step": 2058 }, { "epoch": 2.178835978835979, "grad_norm": 0.20648370010623027, "learning_rate": 1.5229321834574677e-05, "loss": 0.3356, "step": 2059 }, { "epoch": 2.17989417989418, "grad_norm": 0.21130639938010654, "learning_rate": 1.5209721677773423e-05, "loss": 0.3638, "step": 2060 }, { "epoch": 2.1809523809523808, "grad_norm": 0.2115670503852573, "learning_rate": 1.5190121520972169e-05, "loss": 0.3432, "step": 2061 }, { "epoch": 2.182010582010582, "grad_norm": 0.1908436166806636, "learning_rate": 1.5170521364170915e-05, "loss": 0.3041, "step": 2062 }, { "epoch": 2.183068783068783, "grad_norm": 0.2020564072365534, "learning_rate": 1.515092120736966e-05, "loss": 0.336, "step": 2063 }, { "epoch": 2.1841269841269844, "grad_norm": 0.2161693720721347, "learning_rate": 1.5131321050568407e-05, "loss": 0.3545, "step": 2064 }, { "epoch": 2.185185185185185, "grad_norm": 0.22473602950532073, "learning_rate": 1.5111720893767151e-05, "loss": 0.342, "step": 2065 }, { "epoch": 2.1862433862433863, "grad_norm": 0.2150375377284277, "learning_rate": 1.5092120736965895e-05, "loss": 0.3833, "step": 2066 }, { "epoch": 2.1873015873015875, "grad_norm": 0.21870865973475231, "learning_rate": 1.5072520580164643e-05, "loss": 0.3534, "step": 2067 }, { "epoch": 2.1883597883597883, "grad_norm": 0.21446734495632633, "learning_rate": 1.5052920423363387e-05, "loss": 0.3606, "step": 2068 }, { "epoch": 2.1894179894179895, "grad_norm": 0.22888453880818366, "learning_rate": 1.5033320266562134e-05, "loss": 0.3907, "step": 2069 }, { "epoch": 2.1904761904761907, "grad_norm": 0.21937778969235877, "learning_rate": 1.5013720109760879e-05, "loss": 0.3293, "step": 2070 }, { "epoch": 2.1915343915343914, "grad_norm": 0.20058436964581325, "learning_rate": 1.4994119952959623e-05, "loss": 0.3137, "step": 2071 }, { "epoch": 2.1925925925925926, "grad_norm": 0.23325324698372354, "learning_rate": 1.497451979615837e-05, "loss": 0.3512, "step": 2072 }, { "epoch": 2.193650793650794, "grad_norm": 0.21063385449656516, "learning_rate": 1.4954919639357115e-05, "loss": 0.3487, "step": 2073 }, { "epoch": 2.1947089947089946, "grad_norm": 0.22182976034778512, "learning_rate": 1.4935319482555862e-05, "loss": 0.3783, "step": 2074 }, { "epoch": 2.195767195767196, "grad_norm": 0.20256863181044094, "learning_rate": 1.4915719325754606e-05, "loss": 0.3231, "step": 2075 }, { "epoch": 2.196825396825397, "grad_norm": 0.23952493244674197, "learning_rate": 1.4896119168953352e-05, "loss": 0.3828, "step": 2076 }, { "epoch": 2.1978835978835978, "grad_norm": 0.2161109491007153, "learning_rate": 1.4876519012152098e-05, "loss": 0.3379, "step": 2077 }, { "epoch": 2.198941798941799, "grad_norm": 0.21208428593234505, "learning_rate": 1.4856918855350844e-05, "loss": 0.3386, "step": 2078 }, { "epoch": 2.2, "grad_norm": 0.20469783702775474, "learning_rate": 1.483731869854959e-05, "loss": 0.3285, "step": 2079 }, { "epoch": 2.201058201058201, "grad_norm": 0.20106413317496333, "learning_rate": 1.4817718541748336e-05, "loss": 0.3191, "step": 2080 }, { "epoch": 2.202116402116402, "grad_norm": 0.20886186796806508, "learning_rate": 1.479811838494708e-05, "loss": 0.3638, "step": 2081 }, { "epoch": 2.2031746031746033, "grad_norm": 0.2024596948339384, "learning_rate": 1.4778518228145827e-05, "loss": 0.3497, "step": 2082 }, { "epoch": 2.204232804232804, "grad_norm": 0.19966325145216773, "learning_rate": 1.4758918071344572e-05, "loss": 0.3598, "step": 2083 }, { "epoch": 2.2052910052910053, "grad_norm": 0.19469952860860135, "learning_rate": 1.4739317914543316e-05, "loss": 0.3035, "step": 2084 }, { "epoch": 2.2063492063492065, "grad_norm": 0.21170064193002702, "learning_rate": 1.4719717757742063e-05, "loss": 0.304, "step": 2085 }, { "epoch": 2.2074074074074073, "grad_norm": 0.20805361694025837, "learning_rate": 1.4700117600940808e-05, "loss": 0.3511, "step": 2086 }, { "epoch": 2.2084656084656085, "grad_norm": 0.2055245437744949, "learning_rate": 1.4680517444139555e-05, "loss": 0.3544, "step": 2087 }, { "epoch": 2.2095238095238097, "grad_norm": 0.201237038494924, "learning_rate": 1.46609172873383e-05, "loss": 0.3253, "step": 2088 }, { "epoch": 2.2105820105820104, "grad_norm": 0.22354747868539224, "learning_rate": 1.4641317130537043e-05, "loss": 0.3795, "step": 2089 }, { "epoch": 2.2116402116402116, "grad_norm": 0.22190512018803507, "learning_rate": 1.4621716973735791e-05, "loss": 0.3626, "step": 2090 }, { "epoch": 2.212698412698413, "grad_norm": 0.3024061933530375, "learning_rate": 1.4602116816934535e-05, "loss": 0.3426, "step": 2091 }, { "epoch": 2.2137566137566136, "grad_norm": 0.22362434974122583, "learning_rate": 1.4582516660133283e-05, "loss": 0.3568, "step": 2092 }, { "epoch": 2.214814814814815, "grad_norm": 0.2120000539713426, "learning_rate": 1.4562916503332027e-05, "loss": 0.3436, "step": 2093 }, { "epoch": 2.215873015873016, "grad_norm": 0.2273394265772914, "learning_rate": 1.4543316346530773e-05, "loss": 0.347, "step": 2094 }, { "epoch": 2.2169312169312168, "grad_norm": 0.20044541135515623, "learning_rate": 1.4523716189729519e-05, "loss": 0.3415, "step": 2095 }, { "epoch": 2.217989417989418, "grad_norm": 0.20384591070530153, "learning_rate": 1.4504116032928265e-05, "loss": 0.3278, "step": 2096 }, { "epoch": 2.219047619047619, "grad_norm": 0.25810597441442096, "learning_rate": 1.4484515876127009e-05, "loss": 0.3534, "step": 2097 }, { "epoch": 2.22010582010582, "grad_norm": 0.22795352736979752, "learning_rate": 1.4464915719325756e-05, "loss": 0.3455, "step": 2098 }, { "epoch": 2.221164021164021, "grad_norm": 0.21049254714677118, "learning_rate": 1.44453155625245e-05, "loss": 0.3808, "step": 2099 }, { "epoch": 2.2222222222222223, "grad_norm": 0.23067771453190306, "learning_rate": 1.4425715405723248e-05, "loss": 0.3285, "step": 2100 }, { "epoch": 2.223280423280423, "grad_norm": 0.2200823890048532, "learning_rate": 1.4406115248921992e-05, "loss": 0.3497, "step": 2101 }, { "epoch": 2.2243386243386243, "grad_norm": 3.4221269378218153, "learning_rate": 1.4386515092120736e-05, "loss": 0.4657, "step": 2102 }, { "epoch": 2.2253968253968255, "grad_norm": 0.23261505421987935, "learning_rate": 1.4366914935319484e-05, "loss": 0.3376, "step": 2103 }, { "epoch": 2.2264550264550262, "grad_norm": 0.20182994591620118, "learning_rate": 1.4347314778518228e-05, "loss": 0.3526, "step": 2104 }, { "epoch": 2.2275132275132274, "grad_norm": 0.21082285581411647, "learning_rate": 1.4327714621716976e-05, "loss": 0.3552, "step": 2105 }, { "epoch": 2.2285714285714286, "grad_norm": 0.22032489334949037, "learning_rate": 1.430811446491572e-05, "loss": 0.3756, "step": 2106 }, { "epoch": 2.2296296296296294, "grad_norm": 0.2159520496931439, "learning_rate": 1.4288514308114464e-05, "loss": 0.3592, "step": 2107 }, { "epoch": 2.2306878306878306, "grad_norm": 0.20689680272282845, "learning_rate": 1.4268914151313212e-05, "loss": 0.3491, "step": 2108 }, { "epoch": 2.231746031746032, "grad_norm": 0.19169011186080528, "learning_rate": 1.4249313994511956e-05, "loss": 0.3355, "step": 2109 }, { "epoch": 2.2328042328042326, "grad_norm": 0.19803274438625507, "learning_rate": 1.4229713837710703e-05, "loss": 0.3211, "step": 2110 }, { "epoch": 2.2338624338624338, "grad_norm": 0.2169935695452419, "learning_rate": 1.4210113680909448e-05, "loss": 0.3789, "step": 2111 }, { "epoch": 2.234920634920635, "grad_norm": 0.20431499248719218, "learning_rate": 1.4190513524108193e-05, "loss": 0.3162, "step": 2112 }, { "epoch": 2.235978835978836, "grad_norm": 0.2079250549519578, "learning_rate": 1.417091336730694e-05, "loss": 0.3469, "step": 2113 }, { "epoch": 2.237037037037037, "grad_norm": 0.22941584714468002, "learning_rate": 1.4151313210505685e-05, "loss": 0.405, "step": 2114 }, { "epoch": 2.238095238095238, "grad_norm": 0.19630828002495715, "learning_rate": 1.413171305370443e-05, "loss": 0.3515, "step": 2115 }, { "epoch": 2.2391534391534393, "grad_norm": 0.210404649350759, "learning_rate": 1.4112112896903177e-05, "loss": 0.3376, "step": 2116 }, { "epoch": 2.24021164021164, "grad_norm": 0.2376328983905229, "learning_rate": 1.4092512740101921e-05, "loss": 0.388, "step": 2117 }, { "epoch": 2.2412698412698413, "grad_norm": 0.21383169199679727, "learning_rate": 1.4072912583300669e-05, "loss": 0.3199, "step": 2118 }, { "epoch": 2.2423280423280425, "grad_norm": 0.20202389531154502, "learning_rate": 1.4053312426499413e-05, "loss": 0.3684, "step": 2119 }, { "epoch": 2.2433862433862433, "grad_norm": 0.32250759227392084, "learning_rate": 1.4033712269698157e-05, "loss": 0.3633, "step": 2120 }, { "epoch": 2.2444444444444445, "grad_norm": 0.23290731003562973, "learning_rate": 1.4014112112896905e-05, "loss": 0.4027, "step": 2121 }, { "epoch": 2.2455026455026457, "grad_norm": 0.2116123949601418, "learning_rate": 1.3994511956095649e-05, "loss": 0.3819, "step": 2122 }, { "epoch": 2.2465608465608464, "grad_norm": 0.21423684536621376, "learning_rate": 1.3974911799294396e-05, "loss": 0.3684, "step": 2123 }, { "epoch": 2.2476190476190476, "grad_norm": 0.2052592452761068, "learning_rate": 1.395531164249314e-05, "loss": 0.3344, "step": 2124 }, { "epoch": 2.248677248677249, "grad_norm": 0.20079823755779902, "learning_rate": 1.3935711485691885e-05, "loss": 0.3618, "step": 2125 }, { "epoch": 2.2497354497354496, "grad_norm": 1.0894959314523671, "learning_rate": 1.3916111328890632e-05, "loss": 0.3475, "step": 2126 }, { "epoch": 2.250793650793651, "grad_norm": 0.2099112875651436, "learning_rate": 1.3896511172089377e-05, "loss": 0.3252, "step": 2127 }, { "epoch": 2.251851851851852, "grad_norm": 0.20136693529793254, "learning_rate": 1.3876911015288124e-05, "loss": 0.3378, "step": 2128 }, { "epoch": 2.2529100529100528, "grad_norm": 0.2105920879605314, "learning_rate": 1.3857310858486868e-05, "loss": 0.3455, "step": 2129 }, { "epoch": 2.253968253968254, "grad_norm": 0.21728105108056908, "learning_rate": 1.3837710701685614e-05, "loss": 0.3555, "step": 2130 }, { "epoch": 2.255026455026455, "grad_norm": 0.19483145128816542, "learning_rate": 1.381811054488436e-05, "loss": 0.3163, "step": 2131 }, { "epoch": 2.256084656084656, "grad_norm": 0.18761723281777778, "learning_rate": 1.3798510388083106e-05, "loss": 0.3057, "step": 2132 }, { "epoch": 2.257142857142857, "grad_norm": 0.2031218040229401, "learning_rate": 1.377891023128185e-05, "loss": 0.3118, "step": 2133 }, { "epoch": 2.2582010582010583, "grad_norm": 0.19461075746598536, "learning_rate": 1.3759310074480598e-05, "loss": 0.3307, "step": 2134 }, { "epoch": 2.259259259259259, "grad_norm": 0.23054743512215764, "learning_rate": 1.3739709917679342e-05, "loss": 0.4172, "step": 2135 }, { "epoch": 2.2603174603174603, "grad_norm": 0.21187621751878205, "learning_rate": 1.372010976087809e-05, "loss": 0.3666, "step": 2136 }, { "epoch": 2.2613756613756615, "grad_norm": 0.22640129012026444, "learning_rate": 1.3700509604076834e-05, "loss": 0.3875, "step": 2137 }, { "epoch": 2.2624338624338622, "grad_norm": 0.19823862095773512, "learning_rate": 1.3680909447275578e-05, "loss": 0.2947, "step": 2138 }, { "epoch": 2.2634920634920634, "grad_norm": 0.21825276904693997, "learning_rate": 1.3661309290474325e-05, "loss": 0.3833, "step": 2139 }, { "epoch": 2.2645502645502646, "grad_norm": 0.21409936187628198, "learning_rate": 1.364170913367307e-05, "loss": 0.3485, "step": 2140 }, { "epoch": 2.265608465608466, "grad_norm": 0.19696631739592757, "learning_rate": 1.3622108976871817e-05, "loss": 0.325, "step": 2141 }, { "epoch": 2.2666666666666666, "grad_norm": 0.18972546473558874, "learning_rate": 1.3602508820070561e-05, "loss": 0.3315, "step": 2142 }, { "epoch": 2.267724867724868, "grad_norm": 0.20576822255139696, "learning_rate": 1.3582908663269305e-05, "loss": 0.3432, "step": 2143 }, { "epoch": 2.268783068783069, "grad_norm": 1.5364918959173623, "learning_rate": 1.3563308506468053e-05, "loss": 0.3966, "step": 2144 }, { "epoch": 2.2698412698412698, "grad_norm": 0.21478874797886527, "learning_rate": 1.3543708349666797e-05, "loss": 0.3491, "step": 2145 }, { "epoch": 2.270899470899471, "grad_norm": 0.21276572642119787, "learning_rate": 1.3524108192865541e-05, "loss": 0.3825, "step": 2146 }, { "epoch": 2.271957671957672, "grad_norm": 0.1955401249488189, "learning_rate": 1.3504508036064289e-05, "loss": 0.3066, "step": 2147 }, { "epoch": 2.273015873015873, "grad_norm": 0.19895969650589623, "learning_rate": 1.3484907879263035e-05, "loss": 0.3479, "step": 2148 }, { "epoch": 2.274074074074074, "grad_norm": 0.21571935832308445, "learning_rate": 1.346530772246178e-05, "loss": 0.3688, "step": 2149 }, { "epoch": 2.2751322751322753, "grad_norm": 0.21735528938360044, "learning_rate": 1.3445707565660527e-05, "loss": 0.3332, "step": 2150 }, { "epoch": 2.276190476190476, "grad_norm": 0.19334954892944037, "learning_rate": 1.342610740885927e-05, "loss": 0.313, "step": 2151 }, { "epoch": 2.2772486772486773, "grad_norm": 0.20885026871074683, "learning_rate": 1.3406507252058018e-05, "loss": 0.3296, "step": 2152 }, { "epoch": 2.2783068783068785, "grad_norm": 0.1976772642773375, "learning_rate": 1.3386907095256762e-05, "loss": 0.3354, "step": 2153 }, { "epoch": 2.2793650793650793, "grad_norm": 0.21922541090556855, "learning_rate": 1.336730693845551e-05, "loss": 0.3702, "step": 2154 }, { "epoch": 2.2804232804232805, "grad_norm": 0.22298245670974845, "learning_rate": 1.3347706781654254e-05, "loss": 0.3339, "step": 2155 }, { "epoch": 2.2814814814814817, "grad_norm": 0.21558822449093906, "learning_rate": 1.3328106624852998e-05, "loss": 0.3699, "step": 2156 }, { "epoch": 2.2825396825396824, "grad_norm": 0.20618573290980355, "learning_rate": 1.3308506468051746e-05, "loss": 0.3411, "step": 2157 }, { "epoch": 2.2835978835978836, "grad_norm": 0.22181612653063038, "learning_rate": 1.328890631125049e-05, "loss": 0.3479, "step": 2158 }, { "epoch": 2.284656084656085, "grad_norm": 0.2033737787576765, "learning_rate": 1.3269306154449238e-05, "loss": 0.309, "step": 2159 }, { "epoch": 2.2857142857142856, "grad_norm": 0.2304280611065484, "learning_rate": 1.3249705997647982e-05, "loss": 0.3606, "step": 2160 }, { "epoch": 2.286772486772487, "grad_norm": 0.21105846127089964, "learning_rate": 1.3230105840846726e-05, "loss": 0.3343, "step": 2161 }, { "epoch": 2.287830687830688, "grad_norm": 0.21136860640342398, "learning_rate": 1.3210505684045474e-05, "loss": 0.3407, "step": 2162 }, { "epoch": 2.2888888888888888, "grad_norm": 0.21010473995437073, "learning_rate": 1.3190905527244218e-05, "loss": 0.3338, "step": 2163 }, { "epoch": 2.28994708994709, "grad_norm": 0.21899937987852755, "learning_rate": 1.3171305370442964e-05, "loss": 0.3486, "step": 2164 }, { "epoch": 2.291005291005291, "grad_norm": 0.1975690727274802, "learning_rate": 1.315170521364171e-05, "loss": 0.3115, "step": 2165 }, { "epoch": 2.292063492063492, "grad_norm": 0.19198092778024467, "learning_rate": 1.3132105056840455e-05, "loss": 0.3275, "step": 2166 }, { "epoch": 2.293121693121693, "grad_norm": 0.19304872180177207, "learning_rate": 1.3112504900039201e-05, "loss": 0.3058, "step": 2167 }, { "epoch": 2.2941798941798943, "grad_norm": 0.22184189861680062, "learning_rate": 1.3092904743237947e-05, "loss": 0.3557, "step": 2168 }, { "epoch": 2.295238095238095, "grad_norm": 0.2044727357753798, "learning_rate": 1.3073304586436691e-05, "loss": 0.3775, "step": 2169 }, { "epoch": 2.2962962962962963, "grad_norm": 0.2109940769725802, "learning_rate": 1.3053704429635439e-05, "loss": 0.3606, "step": 2170 }, { "epoch": 2.2973544973544975, "grad_norm": 0.20603745974637752, "learning_rate": 1.3034104272834183e-05, "loss": 0.3568, "step": 2171 }, { "epoch": 2.2984126984126982, "grad_norm": 0.20723859027326352, "learning_rate": 1.301450411603293e-05, "loss": 0.3598, "step": 2172 }, { "epoch": 2.2994708994708994, "grad_norm": 0.22014955361649102, "learning_rate": 1.2994903959231675e-05, "loss": 0.3645, "step": 2173 }, { "epoch": 2.3005291005291006, "grad_norm": 0.21240912935645942, "learning_rate": 1.2975303802430419e-05, "loss": 0.3227, "step": 2174 }, { "epoch": 2.3015873015873014, "grad_norm": 0.1984055417276385, "learning_rate": 1.2955703645629167e-05, "loss": 0.3403, "step": 2175 }, { "epoch": 2.3026455026455026, "grad_norm": 0.20615107635995517, "learning_rate": 1.293610348882791e-05, "loss": 0.357, "step": 2176 }, { "epoch": 2.303703703703704, "grad_norm": 0.2883475072855802, "learning_rate": 1.2916503332026658e-05, "loss": 0.328, "step": 2177 }, { "epoch": 2.3047619047619046, "grad_norm": 0.21617403311285718, "learning_rate": 1.2896903175225403e-05, "loss": 0.3207, "step": 2178 }, { "epoch": 2.3058201058201058, "grad_norm": 0.22261165487391843, "learning_rate": 1.2877303018424147e-05, "loss": 0.3905, "step": 2179 }, { "epoch": 2.306878306878307, "grad_norm": 0.19746052299888087, "learning_rate": 1.2857702861622894e-05, "loss": 0.3344, "step": 2180 }, { "epoch": 2.3079365079365077, "grad_norm": 0.20619793904799188, "learning_rate": 1.2838102704821638e-05, "loss": 0.3187, "step": 2181 }, { "epoch": 2.308994708994709, "grad_norm": 0.21727096923527425, "learning_rate": 1.2818502548020384e-05, "loss": 0.393, "step": 2182 }, { "epoch": 2.31005291005291, "grad_norm": 0.22496945697900078, "learning_rate": 1.279890239121913e-05, "loss": 0.4016, "step": 2183 }, { "epoch": 2.311111111111111, "grad_norm": 0.20878049645676627, "learning_rate": 1.2779302234417876e-05, "loss": 0.3308, "step": 2184 }, { "epoch": 2.312169312169312, "grad_norm": 0.19876898454737266, "learning_rate": 1.2759702077616622e-05, "loss": 0.3472, "step": 2185 }, { "epoch": 2.3132275132275133, "grad_norm": 0.22470198675510053, "learning_rate": 1.2740101920815368e-05, "loss": 0.3723, "step": 2186 }, { "epoch": 2.314285714285714, "grad_norm": 0.2149186208399833, "learning_rate": 1.2720501764014112e-05, "loss": 0.3609, "step": 2187 }, { "epoch": 2.3153439153439153, "grad_norm": 0.21505615763181798, "learning_rate": 1.270090160721286e-05, "loss": 0.3775, "step": 2188 }, { "epoch": 2.3164021164021165, "grad_norm": 0.21734973914888298, "learning_rate": 1.2681301450411604e-05, "loss": 0.3578, "step": 2189 }, { "epoch": 2.317460317460317, "grad_norm": 0.19738584826580072, "learning_rate": 1.2661701293610351e-05, "loss": 0.3114, "step": 2190 }, { "epoch": 2.3185185185185184, "grad_norm": 0.2059651241318454, "learning_rate": 1.2642101136809096e-05, "loss": 0.3503, "step": 2191 }, { "epoch": 2.3195767195767196, "grad_norm": 0.21191185476329366, "learning_rate": 1.262250098000784e-05, "loss": 0.3277, "step": 2192 }, { "epoch": 2.320634920634921, "grad_norm": 0.21822223673790192, "learning_rate": 1.2602900823206587e-05, "loss": 0.3402, "step": 2193 }, { "epoch": 2.3216931216931216, "grad_norm": 0.22119523492818854, "learning_rate": 1.2583300666405331e-05, "loss": 0.3931, "step": 2194 }, { "epoch": 2.322751322751323, "grad_norm": 0.18136041095386, "learning_rate": 1.2563700509604076e-05, "loss": 0.3065, "step": 2195 }, { "epoch": 2.323809523809524, "grad_norm": 0.28983751655981344, "learning_rate": 1.2544100352802823e-05, "loss": 0.366, "step": 2196 }, { "epoch": 2.3248677248677247, "grad_norm": 0.2191979489235065, "learning_rate": 1.2524500196001567e-05, "loss": 0.3444, "step": 2197 }, { "epoch": 2.325925925925926, "grad_norm": 0.200384296867351, "learning_rate": 1.2504900039200315e-05, "loss": 0.3277, "step": 2198 }, { "epoch": 2.326984126984127, "grad_norm": 0.20207450606748373, "learning_rate": 1.248529988239906e-05, "loss": 0.3619, "step": 2199 }, { "epoch": 2.328042328042328, "grad_norm": 0.20817037763377247, "learning_rate": 1.2465699725597805e-05, "loss": 0.349, "step": 2200 }, { "epoch": 2.329100529100529, "grad_norm": 0.20772850458509592, "learning_rate": 1.2446099568796551e-05, "loss": 0.3289, "step": 2201 }, { "epoch": 2.3301587301587303, "grad_norm": 1.2036917950362793, "learning_rate": 1.2426499411995297e-05, "loss": 0.4814, "step": 2202 }, { "epoch": 2.331216931216931, "grad_norm": 0.21881788351835138, "learning_rate": 1.2406899255194043e-05, "loss": 0.3625, "step": 2203 }, { "epoch": 2.3322751322751323, "grad_norm": 0.20154036096455355, "learning_rate": 1.2387299098392789e-05, "loss": 0.3686, "step": 2204 }, { "epoch": 2.3333333333333335, "grad_norm": 0.20121002730380697, "learning_rate": 1.2367698941591534e-05, "loss": 0.3603, "step": 2205 }, { "epoch": 2.3343915343915342, "grad_norm": 0.2197435442897782, "learning_rate": 1.234809878479028e-05, "loss": 0.349, "step": 2206 }, { "epoch": 2.3354497354497354, "grad_norm": 0.21748291890475052, "learning_rate": 1.2328498627989024e-05, "loss": 0.4108, "step": 2207 }, { "epoch": 2.3365079365079366, "grad_norm": 0.18641463759702262, "learning_rate": 1.230889847118777e-05, "loss": 0.31, "step": 2208 }, { "epoch": 2.3375661375661374, "grad_norm": 0.19503644892500818, "learning_rate": 1.2289298314386516e-05, "loss": 0.3595, "step": 2209 }, { "epoch": 2.3386243386243386, "grad_norm": 0.20574304094175103, "learning_rate": 1.2269698157585262e-05, "loss": 0.3666, "step": 2210 }, { "epoch": 2.33968253968254, "grad_norm": 0.20069061348310832, "learning_rate": 1.2250098000784006e-05, "loss": 0.3123, "step": 2211 }, { "epoch": 2.3407407407407406, "grad_norm": 0.20015298378799415, "learning_rate": 1.2230497843982752e-05, "loss": 0.3545, "step": 2212 }, { "epoch": 2.3417989417989418, "grad_norm": 0.19565316573476668, "learning_rate": 1.2210897687181498e-05, "loss": 0.3481, "step": 2213 }, { "epoch": 2.342857142857143, "grad_norm": 0.213652830993624, "learning_rate": 1.2191297530380244e-05, "loss": 0.3492, "step": 2214 }, { "epoch": 2.3439153439153437, "grad_norm": 0.211527905920233, "learning_rate": 1.2171697373578988e-05, "loss": 0.372, "step": 2215 }, { "epoch": 2.344973544973545, "grad_norm": 0.20693192379565375, "learning_rate": 1.2152097216777734e-05, "loss": 0.3772, "step": 2216 }, { "epoch": 2.346031746031746, "grad_norm": 0.20506549919992692, "learning_rate": 1.213249705997648e-05, "loss": 0.3506, "step": 2217 }, { "epoch": 2.3470899470899473, "grad_norm": 0.29640440277261043, "learning_rate": 1.2112896903175226e-05, "loss": 0.4008, "step": 2218 }, { "epoch": 2.348148148148148, "grad_norm": 0.21563211935876653, "learning_rate": 1.2093296746373972e-05, "loss": 0.366, "step": 2219 }, { "epoch": 2.3492063492063493, "grad_norm": 0.20236613701883793, "learning_rate": 1.2073696589572717e-05, "loss": 0.3492, "step": 2220 }, { "epoch": 2.3502645502645505, "grad_norm": 0.1966092544544384, "learning_rate": 1.2054096432771463e-05, "loss": 0.3317, "step": 2221 }, { "epoch": 2.3513227513227513, "grad_norm": 0.21784607574068482, "learning_rate": 1.203449627597021e-05, "loss": 0.3847, "step": 2222 }, { "epoch": 2.3523809523809525, "grad_norm": 0.21084416216589955, "learning_rate": 1.2014896119168955e-05, "loss": 0.3637, "step": 2223 }, { "epoch": 2.3534391534391537, "grad_norm": 0.1991773340459715, "learning_rate": 1.19952959623677e-05, "loss": 0.3303, "step": 2224 }, { "epoch": 2.3544973544973544, "grad_norm": 0.1918709626897971, "learning_rate": 1.1975695805566445e-05, "loss": 0.3025, "step": 2225 }, { "epoch": 2.3555555555555556, "grad_norm": 0.19838360255526297, "learning_rate": 1.1956095648765191e-05, "loss": 0.3105, "step": 2226 }, { "epoch": 2.356613756613757, "grad_norm": 0.20910709785260412, "learning_rate": 1.1936495491963937e-05, "loss": 0.3434, "step": 2227 }, { "epoch": 2.3576719576719576, "grad_norm": 0.20813311118331637, "learning_rate": 1.1916895335162683e-05, "loss": 0.3388, "step": 2228 }, { "epoch": 2.358730158730159, "grad_norm": 0.21726113463115548, "learning_rate": 1.1897295178361427e-05, "loss": 0.3551, "step": 2229 }, { "epoch": 2.35978835978836, "grad_norm": 0.21011962247860588, "learning_rate": 1.1877695021560173e-05, "loss": 0.3509, "step": 2230 }, { "epoch": 2.3608465608465607, "grad_norm": 0.1953218231695064, "learning_rate": 1.1858094864758919e-05, "loss": 0.319, "step": 2231 }, { "epoch": 2.361904761904762, "grad_norm": 0.20596213011515244, "learning_rate": 1.1838494707957665e-05, "loss": 0.3517, "step": 2232 }, { "epoch": 2.362962962962963, "grad_norm": 0.2126925193532187, "learning_rate": 1.1818894551156409e-05, "loss": 0.3767, "step": 2233 }, { "epoch": 2.364021164021164, "grad_norm": 0.19743855767120733, "learning_rate": 1.1799294394355155e-05, "loss": 0.3302, "step": 2234 }, { "epoch": 2.365079365079365, "grad_norm": 0.1867269633062068, "learning_rate": 1.17796942375539e-05, "loss": 0.3275, "step": 2235 }, { "epoch": 2.3661375661375663, "grad_norm": 0.19798324638566683, "learning_rate": 1.1760094080752646e-05, "loss": 0.3358, "step": 2236 }, { "epoch": 2.367195767195767, "grad_norm": 0.21204816572937232, "learning_rate": 1.1740493923951392e-05, "loss": 0.3557, "step": 2237 }, { "epoch": 2.3682539682539683, "grad_norm": 0.20561285406488616, "learning_rate": 1.1720893767150138e-05, "loss": 0.3859, "step": 2238 }, { "epoch": 2.3693121693121695, "grad_norm": 0.1844307143572627, "learning_rate": 1.1701293610348884e-05, "loss": 0.3093, "step": 2239 }, { "epoch": 2.3703703703703702, "grad_norm": 0.18285268707845753, "learning_rate": 1.168169345354763e-05, "loss": 0.2879, "step": 2240 }, { "epoch": 2.3714285714285714, "grad_norm": 0.18891596600216767, "learning_rate": 1.1662093296746376e-05, "loss": 0.3164, "step": 2241 }, { "epoch": 2.3724867724867726, "grad_norm": 0.21443463118106196, "learning_rate": 1.164249313994512e-05, "loss": 0.3724, "step": 2242 }, { "epoch": 2.3735449735449734, "grad_norm": 0.20092191567682116, "learning_rate": 1.1622892983143866e-05, "loss": 0.336, "step": 2243 }, { "epoch": 2.3746031746031746, "grad_norm": 0.21661040764716843, "learning_rate": 1.1603292826342612e-05, "loss": 0.3457, "step": 2244 }, { "epoch": 2.375661375661376, "grad_norm": 0.190041744765612, "learning_rate": 1.1583692669541358e-05, "loss": 0.3251, "step": 2245 }, { "epoch": 2.3767195767195766, "grad_norm": 0.1886509935215561, "learning_rate": 1.1564092512740103e-05, "loss": 0.3268, "step": 2246 }, { "epoch": 2.3777777777777778, "grad_norm": 0.19137796325706583, "learning_rate": 1.1544492355938848e-05, "loss": 0.3137, "step": 2247 }, { "epoch": 2.378835978835979, "grad_norm": 0.19633048102603665, "learning_rate": 1.1524892199137593e-05, "loss": 0.3428, "step": 2248 }, { "epoch": 2.3798941798941797, "grad_norm": 0.20298501174663242, "learning_rate": 1.150529204233634e-05, "loss": 0.3336, "step": 2249 }, { "epoch": 2.380952380952381, "grad_norm": 0.20915086314748638, "learning_rate": 1.1485691885535085e-05, "loss": 0.3538, "step": 2250 }, { "epoch": 2.382010582010582, "grad_norm": 0.1959406139546007, "learning_rate": 1.146609172873383e-05, "loss": 0.3204, "step": 2251 }, { "epoch": 2.383068783068783, "grad_norm": 0.1980356952193389, "learning_rate": 1.1446491571932575e-05, "loss": 0.3369, "step": 2252 }, { "epoch": 2.384126984126984, "grad_norm": 0.21603527021327792, "learning_rate": 1.1426891415131321e-05, "loss": 0.4106, "step": 2253 }, { "epoch": 2.3851851851851853, "grad_norm": 0.19272692236278238, "learning_rate": 1.1407291258330067e-05, "loss": 0.3393, "step": 2254 }, { "epoch": 2.386243386243386, "grad_norm": 0.1956248707684031, "learning_rate": 1.1387691101528813e-05, "loss": 0.3411, "step": 2255 }, { "epoch": 2.3873015873015873, "grad_norm": 0.19014685494467712, "learning_rate": 1.1368090944727559e-05, "loss": 0.3185, "step": 2256 }, { "epoch": 2.3883597883597885, "grad_norm": 0.20447615014158566, "learning_rate": 1.1348490787926305e-05, "loss": 0.3513, "step": 2257 }, { "epoch": 2.389417989417989, "grad_norm": 0.21079536892016623, "learning_rate": 1.132889063112505e-05, "loss": 0.3377, "step": 2258 }, { "epoch": 2.3904761904761904, "grad_norm": 0.20403855917854744, "learning_rate": 1.1309290474323796e-05, "loss": 0.3728, "step": 2259 }, { "epoch": 2.3915343915343916, "grad_norm": 0.19160044266304876, "learning_rate": 1.128969031752254e-05, "loss": 0.3113, "step": 2260 }, { "epoch": 2.3925925925925924, "grad_norm": 0.2081785958094779, "learning_rate": 1.1270090160721286e-05, "loss": 0.3618, "step": 2261 }, { "epoch": 2.3936507936507936, "grad_norm": 0.2080027481507071, "learning_rate": 1.1250490003920032e-05, "loss": 0.3484, "step": 2262 }, { "epoch": 2.394708994708995, "grad_norm": 0.2048194108111222, "learning_rate": 1.1230889847118778e-05, "loss": 0.3429, "step": 2263 }, { "epoch": 2.3957671957671955, "grad_norm": 0.2103500941268802, "learning_rate": 1.1211289690317522e-05, "loss": 0.3665, "step": 2264 }, { "epoch": 2.3968253968253967, "grad_norm": 0.21252569969121238, "learning_rate": 1.1191689533516268e-05, "loss": 0.3346, "step": 2265 }, { "epoch": 2.397883597883598, "grad_norm": 0.19587149638160067, "learning_rate": 1.1172089376715014e-05, "loss": 0.3352, "step": 2266 }, { "epoch": 2.3989417989417987, "grad_norm": 0.21361691084135886, "learning_rate": 1.115248921991376e-05, "loss": 0.3695, "step": 2267 }, { "epoch": 2.4, "grad_norm": 0.1860233174405519, "learning_rate": 1.1132889063112506e-05, "loss": 0.3049, "step": 2268 }, { "epoch": 2.401058201058201, "grad_norm": 0.21531066517258854, "learning_rate": 1.111328890631125e-05, "loss": 0.3919, "step": 2269 }, { "epoch": 2.402116402116402, "grad_norm": 0.20973338346864226, "learning_rate": 1.1093688749509996e-05, "loss": 0.309, "step": 2270 }, { "epoch": 2.403174603174603, "grad_norm": 0.19942109842684344, "learning_rate": 1.1074088592708742e-05, "loss": 0.3438, "step": 2271 }, { "epoch": 2.4042328042328043, "grad_norm": 0.20332330726245493, "learning_rate": 1.1054488435907488e-05, "loss": 0.3716, "step": 2272 }, { "epoch": 2.4052910052910055, "grad_norm": 0.2109034509123887, "learning_rate": 1.1034888279106234e-05, "loss": 0.3815, "step": 2273 }, { "epoch": 2.4063492063492062, "grad_norm": 0.20967094697117655, "learning_rate": 1.101528812230498e-05, "loss": 0.3791, "step": 2274 }, { "epoch": 2.4074074074074074, "grad_norm": 0.20137962520759609, "learning_rate": 1.0995687965503725e-05, "loss": 0.352, "step": 2275 }, { "epoch": 2.4084656084656086, "grad_norm": 0.1959407595415704, "learning_rate": 1.0976087808702471e-05, "loss": 0.3128, "step": 2276 }, { "epoch": 2.4095238095238094, "grad_norm": 0.2025181599605024, "learning_rate": 1.0956487651901217e-05, "loss": 0.3643, "step": 2277 }, { "epoch": 2.4105820105820106, "grad_norm": 0.230544012868408, "learning_rate": 1.0936887495099961e-05, "loss": 0.3726, "step": 2278 }, { "epoch": 2.411640211640212, "grad_norm": 0.1990066422140862, "learning_rate": 1.0917287338298707e-05, "loss": 0.3638, "step": 2279 }, { "epoch": 2.4126984126984126, "grad_norm": 0.18998917405534244, "learning_rate": 1.0897687181497453e-05, "loss": 0.3274, "step": 2280 }, { "epoch": 2.4137566137566138, "grad_norm": 0.20400621001125108, "learning_rate": 1.0878087024696199e-05, "loss": 0.3977, "step": 2281 }, { "epoch": 2.414814814814815, "grad_norm": 0.19362884565369348, "learning_rate": 1.0858486867894943e-05, "loss": 0.2982, "step": 2282 }, { "epoch": 2.4158730158730157, "grad_norm": 0.19304320410771325, "learning_rate": 1.0838886711093689e-05, "loss": 0.3363, "step": 2283 }, { "epoch": 2.416931216931217, "grad_norm": 0.18681427914983176, "learning_rate": 1.0819286554292435e-05, "loss": 0.2841, "step": 2284 }, { "epoch": 2.417989417989418, "grad_norm": 0.20184125426045912, "learning_rate": 1.079968639749118e-05, "loss": 0.3428, "step": 2285 }, { "epoch": 2.419047619047619, "grad_norm": 0.2037706142983145, "learning_rate": 1.0780086240689927e-05, "loss": 0.3649, "step": 2286 }, { "epoch": 2.42010582010582, "grad_norm": 0.19817399493675728, "learning_rate": 1.076048608388867e-05, "loss": 0.3506, "step": 2287 }, { "epoch": 2.4211640211640213, "grad_norm": 0.20169589584258404, "learning_rate": 1.0740885927087417e-05, "loss": 0.357, "step": 2288 }, { "epoch": 2.422222222222222, "grad_norm": 0.20942728369583866, "learning_rate": 1.0721285770286162e-05, "loss": 0.3732, "step": 2289 }, { "epoch": 2.4232804232804233, "grad_norm": 0.19265517759072814, "learning_rate": 1.0701685613484908e-05, "loss": 0.3176, "step": 2290 }, { "epoch": 2.4243386243386245, "grad_norm": 0.1982086084106544, "learning_rate": 1.0682085456683654e-05, "loss": 0.3467, "step": 2291 }, { "epoch": 2.425396825396825, "grad_norm": 0.19447098693465864, "learning_rate": 1.06624852998824e-05, "loss": 0.3299, "step": 2292 }, { "epoch": 2.4264550264550264, "grad_norm": 0.20888212480507384, "learning_rate": 1.0642885143081146e-05, "loss": 0.3459, "step": 2293 }, { "epoch": 2.4275132275132276, "grad_norm": 0.208144303971245, "learning_rate": 1.0623284986279892e-05, "loss": 0.367, "step": 2294 }, { "epoch": 2.4285714285714284, "grad_norm": 0.20876100051287758, "learning_rate": 1.0603684829478638e-05, "loss": 0.3745, "step": 2295 }, { "epoch": 2.4296296296296296, "grad_norm": 0.20791252154146914, "learning_rate": 1.0584084672677382e-05, "loss": 0.3491, "step": 2296 }, { "epoch": 2.430687830687831, "grad_norm": 0.19777559356235025, "learning_rate": 1.0564484515876128e-05, "loss": 0.3374, "step": 2297 }, { "epoch": 2.431746031746032, "grad_norm": 0.19110723755738349, "learning_rate": 1.0544884359074874e-05, "loss": 0.3271, "step": 2298 }, { "epoch": 2.4328042328042327, "grad_norm": 0.2008378278505105, "learning_rate": 1.052528420227362e-05, "loss": 0.3387, "step": 2299 }, { "epoch": 2.433862433862434, "grad_norm": 0.21027780382458788, "learning_rate": 1.0505684045472364e-05, "loss": 0.3179, "step": 2300 }, { "epoch": 2.434920634920635, "grad_norm": 0.2026493429086862, "learning_rate": 1.048608388867111e-05, "loss": 0.3345, "step": 2301 }, { "epoch": 2.435978835978836, "grad_norm": 0.20274404356172807, "learning_rate": 1.0466483731869855e-05, "loss": 0.3801, "step": 2302 }, { "epoch": 2.437037037037037, "grad_norm": 0.19273549407390878, "learning_rate": 1.0446883575068601e-05, "loss": 0.3467, "step": 2303 }, { "epoch": 2.4380952380952383, "grad_norm": 0.1818577084303832, "learning_rate": 1.0427283418267345e-05, "loss": 0.3202, "step": 2304 }, { "epoch": 2.439153439153439, "grad_norm": 0.19560068167078504, "learning_rate": 1.0407683261466091e-05, "loss": 0.3273, "step": 2305 }, { "epoch": 2.4402116402116403, "grad_norm": 0.20370517607190086, "learning_rate": 1.0388083104664837e-05, "loss": 0.3403, "step": 2306 }, { "epoch": 2.4412698412698415, "grad_norm": 0.20114205886933845, "learning_rate": 1.0368482947863583e-05, "loss": 0.3522, "step": 2307 }, { "epoch": 2.4423280423280422, "grad_norm": 0.20882464854711777, "learning_rate": 1.0348882791062329e-05, "loss": 0.3666, "step": 2308 }, { "epoch": 2.4433862433862434, "grad_norm": 0.20438756339590045, "learning_rate": 1.0329282634261075e-05, "loss": 0.3435, "step": 2309 }, { "epoch": 2.4444444444444446, "grad_norm": 0.22064605217007735, "learning_rate": 1.030968247745982e-05, "loss": 0.4142, "step": 2310 }, { "epoch": 2.4455026455026454, "grad_norm": 0.1817014882330108, "learning_rate": 1.0290082320658567e-05, "loss": 0.2751, "step": 2311 }, { "epoch": 2.4465608465608466, "grad_norm": 0.19417363691429626, "learning_rate": 1.0270482163857312e-05, "loss": 0.347, "step": 2312 }, { "epoch": 2.447619047619048, "grad_norm": 0.19984236991627735, "learning_rate": 1.0250882007056057e-05, "loss": 0.358, "step": 2313 }, { "epoch": 2.4486772486772486, "grad_norm": 0.20561224905848144, "learning_rate": 1.0231281850254803e-05, "loss": 0.3748, "step": 2314 }, { "epoch": 2.4497354497354498, "grad_norm": 0.2030555882275226, "learning_rate": 1.0211681693453548e-05, "loss": 0.3333, "step": 2315 }, { "epoch": 2.450793650793651, "grad_norm": 0.18955391416822132, "learning_rate": 1.0192081536652294e-05, "loss": 0.3067, "step": 2316 }, { "epoch": 2.4518518518518517, "grad_norm": 0.214593330571037, "learning_rate": 1.017248137985104e-05, "loss": 0.4056, "step": 2317 }, { "epoch": 2.452910052910053, "grad_norm": 0.20084964435456928, "learning_rate": 1.0152881223049784e-05, "loss": 0.3302, "step": 2318 }, { "epoch": 2.453968253968254, "grad_norm": 0.19398682980431026, "learning_rate": 1.013328106624853e-05, "loss": 0.3213, "step": 2319 }, { "epoch": 2.455026455026455, "grad_norm": 0.22095190181637556, "learning_rate": 1.0113680909447276e-05, "loss": 0.4104, "step": 2320 }, { "epoch": 2.456084656084656, "grad_norm": 0.21142265935421076, "learning_rate": 1.0094080752646022e-05, "loss": 0.3691, "step": 2321 }, { "epoch": 2.4571428571428573, "grad_norm": 0.20774600162711074, "learning_rate": 1.0074480595844766e-05, "loss": 0.3576, "step": 2322 }, { "epoch": 2.458201058201058, "grad_norm": 0.19470750734709114, "learning_rate": 1.0054880439043512e-05, "loss": 0.3391, "step": 2323 }, { "epoch": 2.4592592592592593, "grad_norm": 0.20179308051014488, "learning_rate": 1.0035280282242258e-05, "loss": 0.3753, "step": 2324 }, { "epoch": 2.4603174603174605, "grad_norm": 0.1964229149495673, "learning_rate": 1.0015680125441004e-05, "loss": 0.323, "step": 2325 }, { "epoch": 2.461375661375661, "grad_norm": 0.20063757362174908, "learning_rate": 9.99607996863975e-06, "loss": 0.3787, "step": 2326 }, { "epoch": 2.4624338624338624, "grad_norm": 0.1930747913002795, "learning_rate": 9.976479811838496e-06, "loss": 0.319, "step": 2327 }, { "epoch": 2.4634920634920636, "grad_norm": 0.21711063943337874, "learning_rate": 9.956879655037241e-06, "loss": 0.3542, "step": 2328 }, { "epoch": 2.4645502645502644, "grad_norm": 0.1903790970553673, "learning_rate": 9.937279498235987e-06, "loss": 0.3267, "step": 2329 }, { "epoch": 2.4656084656084656, "grad_norm": 0.3974441833827003, "learning_rate": 9.917679341434733e-06, "loss": 0.3863, "step": 2330 }, { "epoch": 2.466666666666667, "grad_norm": 0.21646430718812706, "learning_rate": 9.898079184633477e-06, "loss": 0.3876, "step": 2331 }, { "epoch": 2.4677248677248675, "grad_norm": 0.22008541249629765, "learning_rate": 9.878479027832223e-06, "loss": 0.328, "step": 2332 }, { "epoch": 2.4687830687830687, "grad_norm": 0.19270815027961807, "learning_rate": 9.858878871030969e-06, "loss": 0.3422, "step": 2333 }, { "epoch": 2.46984126984127, "grad_norm": 0.2002872486600827, "learning_rate": 9.839278714229715e-06, "loss": 0.3509, "step": 2334 }, { "epoch": 2.4708994708994707, "grad_norm": 0.216151009251505, "learning_rate": 9.81967855742846e-06, "loss": 0.3872, "step": 2335 }, { "epoch": 2.471957671957672, "grad_norm": 0.21555323390899783, "learning_rate": 9.800078400627205e-06, "loss": 0.3542, "step": 2336 }, { "epoch": 2.473015873015873, "grad_norm": 0.18992518981588918, "learning_rate": 9.780478243825951e-06, "loss": 0.3061, "step": 2337 }, { "epoch": 2.474074074074074, "grad_norm": 0.20207716846236165, "learning_rate": 9.760878087024697e-06, "loss": 0.3492, "step": 2338 }, { "epoch": 2.475132275132275, "grad_norm": 0.3238919978391247, "learning_rate": 9.741277930223443e-06, "loss": 0.3811, "step": 2339 }, { "epoch": 2.4761904761904763, "grad_norm": 0.20016631554309963, "learning_rate": 9.721677773422187e-06, "loss": 0.3375, "step": 2340 }, { "epoch": 2.477248677248677, "grad_norm": 0.2099603304873576, "learning_rate": 9.702077616620933e-06, "loss": 0.3601, "step": 2341 }, { "epoch": 2.4783068783068782, "grad_norm": 0.2163092177109193, "learning_rate": 9.682477459819679e-06, "loss": 0.3929, "step": 2342 }, { "epoch": 2.4793650793650794, "grad_norm": 0.195942799842103, "learning_rate": 9.662877303018424e-06, "loss": 0.3259, "step": 2343 }, { "epoch": 2.48042328042328, "grad_norm": 0.20028542223736842, "learning_rate": 9.64327714621717e-06, "loss": 0.3195, "step": 2344 }, { "epoch": 2.4814814814814814, "grad_norm": 0.2034772057737831, "learning_rate": 9.623676989415916e-06, "loss": 0.3481, "step": 2345 }, { "epoch": 2.4825396825396826, "grad_norm": 0.19210503275161703, "learning_rate": 9.604076832614662e-06, "loss": 0.3252, "step": 2346 }, { "epoch": 2.4835978835978834, "grad_norm": 0.2096974621839747, "learning_rate": 9.584476675813408e-06, "loss": 0.3513, "step": 2347 }, { "epoch": 2.4846560846560846, "grad_norm": 0.20429072459845538, "learning_rate": 9.564876519012154e-06, "loss": 0.3549, "step": 2348 }, { "epoch": 2.4857142857142858, "grad_norm": 0.18861327099569755, "learning_rate": 9.545276362210898e-06, "loss": 0.3381, "step": 2349 }, { "epoch": 2.4867724867724865, "grad_norm": 0.19260260318882602, "learning_rate": 9.525676205409644e-06, "loss": 0.3248, "step": 2350 }, { "epoch": 2.4878306878306877, "grad_norm": 0.2121251443373568, "learning_rate": 9.50607604860839e-06, "loss": 0.3893, "step": 2351 }, { "epoch": 2.488888888888889, "grad_norm": 0.2153555638932981, "learning_rate": 9.486475891807136e-06, "loss": 0.4048, "step": 2352 }, { "epoch": 2.48994708994709, "grad_norm": 0.21789073463498923, "learning_rate": 9.46687573500588e-06, "loss": 0.3747, "step": 2353 }, { "epoch": 2.491005291005291, "grad_norm": 0.20505225693714865, "learning_rate": 9.447275578204626e-06, "loss": 0.3438, "step": 2354 }, { "epoch": 2.492063492063492, "grad_norm": 0.19519515384302688, "learning_rate": 9.427675421403372e-06, "loss": 0.324, "step": 2355 }, { "epoch": 2.4931216931216933, "grad_norm": 0.19903623860168693, "learning_rate": 9.408075264602117e-06, "loss": 0.3503, "step": 2356 }, { "epoch": 2.494179894179894, "grad_norm": 0.2075371111288816, "learning_rate": 9.388475107800863e-06, "loss": 0.3645, "step": 2357 }, { "epoch": 2.4952380952380953, "grad_norm": 0.20375379109475755, "learning_rate": 9.368874950999607e-06, "loss": 0.322, "step": 2358 }, { "epoch": 2.4962962962962965, "grad_norm": 0.22033682656514744, "learning_rate": 9.349274794198353e-06, "loss": 0.3896, "step": 2359 }, { "epoch": 2.497354497354497, "grad_norm": 0.19640384262879929, "learning_rate": 9.3296746373971e-06, "loss": 0.312, "step": 2360 }, { "epoch": 2.4984126984126984, "grad_norm": 0.2081466305517114, "learning_rate": 9.310074480595845e-06, "loss": 0.3352, "step": 2361 }, { "epoch": 2.4994708994708996, "grad_norm": 0.1999811658493017, "learning_rate": 9.290474323794591e-06, "loss": 0.34, "step": 2362 }, { "epoch": 2.5005291005291004, "grad_norm": 0.20242709998710268, "learning_rate": 9.270874166993337e-06, "loss": 0.324, "step": 2363 }, { "epoch": 2.5015873015873016, "grad_norm": 0.1890758571530466, "learning_rate": 9.251274010192083e-06, "loss": 0.3205, "step": 2364 }, { "epoch": 2.502645502645503, "grad_norm": 0.2018935760610972, "learning_rate": 9.231673853390829e-06, "loss": 0.3512, "step": 2365 }, { "epoch": 2.5037037037037035, "grad_norm": 0.1905973585991993, "learning_rate": 9.212073696589574e-06, "loss": 0.3184, "step": 2366 }, { "epoch": 2.5047619047619047, "grad_norm": 0.20119197145153586, "learning_rate": 9.192473539788319e-06, "loss": 0.3715, "step": 2367 }, { "epoch": 2.505820105820106, "grad_norm": 0.22913278918931926, "learning_rate": 9.172873382987065e-06, "loss": 0.3569, "step": 2368 }, { "epoch": 2.506878306878307, "grad_norm": 0.21451859754659375, "learning_rate": 9.15327322618581e-06, "loss": 0.3475, "step": 2369 }, { "epoch": 2.507936507936508, "grad_norm": 0.18032408092733698, "learning_rate": 9.133673069384556e-06, "loss": 0.2882, "step": 2370 }, { "epoch": 2.508994708994709, "grad_norm": 0.18785546074799372, "learning_rate": 9.1140729125833e-06, "loss": 0.321, "step": 2371 }, { "epoch": 2.5100529100529103, "grad_norm": 0.2081663751474157, "learning_rate": 9.094472755782046e-06, "loss": 0.37, "step": 2372 }, { "epoch": 2.511111111111111, "grad_norm": 0.19467624754830543, "learning_rate": 9.074872598980792e-06, "loss": 0.3151, "step": 2373 }, { "epoch": 2.5121693121693123, "grad_norm": 0.21111318671913534, "learning_rate": 9.055272442179538e-06, "loss": 0.3559, "step": 2374 }, { "epoch": 2.5132275132275135, "grad_norm": 0.20158857860626356, "learning_rate": 9.035672285378284e-06, "loss": 0.3413, "step": 2375 }, { "epoch": 2.5142857142857142, "grad_norm": 0.19967004938402297, "learning_rate": 9.016072128577028e-06, "loss": 0.324, "step": 2376 }, { "epoch": 2.5153439153439154, "grad_norm": 0.20043339966519105, "learning_rate": 8.996471971775774e-06, "loss": 0.3575, "step": 2377 }, { "epoch": 2.5164021164021166, "grad_norm": 0.2090345217187549, "learning_rate": 8.97687181497452e-06, "loss": 0.3647, "step": 2378 }, { "epoch": 2.5174603174603174, "grad_norm": 0.19573505746191802, "learning_rate": 8.957271658173266e-06, "loss": 0.3559, "step": 2379 }, { "epoch": 2.5185185185185186, "grad_norm": 0.2108974750518579, "learning_rate": 8.937671501372012e-06, "loss": 0.3635, "step": 2380 }, { "epoch": 2.51957671957672, "grad_norm": 0.200163482912999, "learning_rate": 8.918071344570758e-06, "loss": 0.3469, "step": 2381 }, { "epoch": 2.5206349206349206, "grad_norm": 0.20864956888060593, "learning_rate": 8.898471187769503e-06, "loss": 0.3657, "step": 2382 }, { "epoch": 2.5216931216931218, "grad_norm": 0.19842095672056884, "learning_rate": 8.87887103096825e-06, "loss": 0.3738, "step": 2383 }, { "epoch": 2.522751322751323, "grad_norm": 0.2135741099116242, "learning_rate": 8.859270874166995e-06, "loss": 0.4217, "step": 2384 }, { "epoch": 2.5238095238095237, "grad_norm": 0.1869862686126683, "learning_rate": 8.83967071736574e-06, "loss": 0.3287, "step": 2385 }, { "epoch": 2.524867724867725, "grad_norm": 0.2079464545823666, "learning_rate": 8.820070560564485e-06, "loss": 0.3214, "step": 2386 }, { "epoch": 2.525925925925926, "grad_norm": 0.2196057878276039, "learning_rate": 8.800470403763231e-06, "loss": 0.3939, "step": 2387 }, { "epoch": 2.526984126984127, "grad_norm": 0.20564907375985877, "learning_rate": 8.780870246961977e-06, "loss": 0.3698, "step": 2388 }, { "epoch": 2.528042328042328, "grad_norm": 0.18394751990532987, "learning_rate": 8.761270090160721e-06, "loss": 0.293, "step": 2389 }, { "epoch": 2.5291005291005293, "grad_norm": 0.21363483672053832, "learning_rate": 8.741669933359467e-06, "loss": 0.303, "step": 2390 }, { "epoch": 2.53015873015873, "grad_norm": 0.20780527897074721, "learning_rate": 8.722069776558213e-06, "loss": 0.3877, "step": 2391 }, { "epoch": 2.5312169312169313, "grad_norm": 0.19783156319626172, "learning_rate": 8.702469619756959e-06, "loss": 0.3512, "step": 2392 }, { "epoch": 2.5322751322751325, "grad_norm": 0.1892822612166525, "learning_rate": 8.682869462955705e-06, "loss": 0.3275, "step": 2393 }, { "epoch": 2.533333333333333, "grad_norm": 0.19170942956582057, "learning_rate": 8.663269306154449e-06, "loss": 0.3128, "step": 2394 }, { "epoch": 2.5343915343915344, "grad_norm": 0.21434651400329688, "learning_rate": 8.643669149353195e-06, "loss": 0.371, "step": 2395 }, { "epoch": 2.5354497354497356, "grad_norm": 0.20111551143204334, "learning_rate": 8.62406899255194e-06, "loss": 0.3564, "step": 2396 }, { "epoch": 2.5365079365079364, "grad_norm": 0.18854536443130432, "learning_rate": 8.604468835750686e-06, "loss": 0.3282, "step": 2397 }, { "epoch": 2.5375661375661376, "grad_norm": 0.2024488750022803, "learning_rate": 8.584868678949432e-06, "loss": 0.3738, "step": 2398 }, { "epoch": 2.538624338624339, "grad_norm": 0.20611891154190004, "learning_rate": 8.565268522148178e-06, "loss": 0.3811, "step": 2399 }, { "epoch": 2.5396825396825395, "grad_norm": 0.20495838632725324, "learning_rate": 8.545668365346924e-06, "loss": 0.3613, "step": 2400 }, { "epoch": 2.5407407407407407, "grad_norm": 0.19072363427225575, "learning_rate": 8.52606820854567e-06, "loss": 0.3369, "step": 2401 }, { "epoch": 2.541798941798942, "grad_norm": 0.19789802048128188, "learning_rate": 8.506468051744414e-06, "loss": 0.3239, "step": 2402 }, { "epoch": 2.5428571428571427, "grad_norm": 0.21881605201200133, "learning_rate": 8.48686789494316e-06, "loss": 0.3218, "step": 2403 }, { "epoch": 2.543915343915344, "grad_norm": 0.20069563327958265, "learning_rate": 8.467267738141906e-06, "loss": 0.3424, "step": 2404 }, { "epoch": 2.544973544973545, "grad_norm": 0.2075442824382541, "learning_rate": 8.447667581340652e-06, "loss": 0.3579, "step": 2405 }, { "epoch": 2.546031746031746, "grad_norm": 0.20209877990797678, "learning_rate": 8.428067424539398e-06, "loss": 0.3658, "step": 2406 }, { "epoch": 2.547089947089947, "grad_norm": 0.20347999261766514, "learning_rate": 8.408467267738142e-06, "loss": 0.3624, "step": 2407 }, { "epoch": 2.5481481481481483, "grad_norm": 0.19601569897380608, "learning_rate": 8.388867110936888e-06, "loss": 0.3342, "step": 2408 }, { "epoch": 2.549206349206349, "grad_norm": 0.20238610460423004, "learning_rate": 8.369266954135634e-06, "loss": 0.3509, "step": 2409 }, { "epoch": 2.5502645502645502, "grad_norm": 0.20063742492135442, "learning_rate": 8.34966679733438e-06, "loss": 0.3398, "step": 2410 }, { "epoch": 2.5513227513227514, "grad_norm": 0.20819913291416195, "learning_rate": 8.330066640533124e-06, "loss": 0.3712, "step": 2411 }, { "epoch": 2.552380952380952, "grad_norm": 0.20770506244650877, "learning_rate": 8.31046648373187e-06, "loss": 0.3702, "step": 2412 }, { "epoch": 2.5534391534391534, "grad_norm": 0.1908698216977231, "learning_rate": 8.290866326930615e-06, "loss": 0.3442, "step": 2413 }, { "epoch": 2.5544973544973546, "grad_norm": 0.22309789081285533, "learning_rate": 8.271266170129361e-06, "loss": 0.3393, "step": 2414 }, { "epoch": 2.5555555555555554, "grad_norm": 0.2047928716854341, "learning_rate": 8.251666013328107e-06, "loss": 0.3602, "step": 2415 }, { "epoch": 2.5566137566137566, "grad_norm": 0.19373128203836848, "learning_rate": 8.232065856526853e-06, "loss": 0.3192, "step": 2416 }, { "epoch": 2.5576719576719578, "grad_norm": 0.2243631469556603, "learning_rate": 8.212465699725599e-06, "loss": 0.3471, "step": 2417 }, { "epoch": 2.5587301587301585, "grad_norm": 0.2030278112277129, "learning_rate": 8.192865542924345e-06, "loss": 0.2942, "step": 2418 }, { "epoch": 2.5597883597883597, "grad_norm": 0.21497459789916237, "learning_rate": 8.17326538612309e-06, "loss": 0.4136, "step": 2419 }, { "epoch": 2.560846560846561, "grad_norm": 0.22645848971121996, "learning_rate": 8.153665229321835e-06, "loss": 0.3705, "step": 2420 }, { "epoch": 2.5619047619047617, "grad_norm": 0.18927547831969765, "learning_rate": 8.13406507252058e-06, "loss": 0.287, "step": 2421 }, { "epoch": 2.562962962962963, "grad_norm": 0.19829367281019175, "learning_rate": 8.114464915719327e-06, "loss": 0.3342, "step": 2422 }, { "epoch": 2.564021164021164, "grad_norm": 0.21043939582778504, "learning_rate": 8.094864758918072e-06, "loss": 0.371, "step": 2423 }, { "epoch": 2.565079365079365, "grad_norm": 0.21176652701168425, "learning_rate": 8.075264602116818e-06, "loss": 0.361, "step": 2424 }, { "epoch": 2.566137566137566, "grad_norm": 0.19858482433563368, "learning_rate": 8.055664445315562e-06, "loss": 0.3435, "step": 2425 }, { "epoch": 2.5671957671957673, "grad_norm": 0.1930389532486009, "learning_rate": 8.036064288514308e-06, "loss": 0.3116, "step": 2426 }, { "epoch": 2.568253968253968, "grad_norm": 0.2017032704364643, "learning_rate": 8.016464131713054e-06, "loss": 0.3526, "step": 2427 }, { "epoch": 2.569312169312169, "grad_norm": 0.2100015873501434, "learning_rate": 7.9968639749118e-06, "loss": 0.3647, "step": 2428 }, { "epoch": 2.5703703703703704, "grad_norm": 0.20235907257311597, "learning_rate": 7.977263818110544e-06, "loss": 0.3405, "step": 2429 }, { "epoch": 2.571428571428571, "grad_norm": 0.19998918287012984, "learning_rate": 7.95766366130929e-06, "loss": 0.376, "step": 2430 }, { "epoch": 2.5724867724867724, "grad_norm": 0.19691447139385335, "learning_rate": 7.938063504508036e-06, "loss": 0.3494, "step": 2431 }, { "epoch": 2.5735449735449736, "grad_norm": 0.2135251585389222, "learning_rate": 7.918463347706782e-06, "loss": 0.3532, "step": 2432 }, { "epoch": 2.5746031746031743, "grad_norm": 0.201995066867421, "learning_rate": 7.898863190905528e-06, "loss": 0.3443, "step": 2433 }, { "epoch": 2.5756613756613755, "grad_norm": 0.18780759974783323, "learning_rate": 7.879263034104274e-06, "loss": 0.3067, "step": 2434 }, { "epoch": 2.5767195767195767, "grad_norm": 0.19161770345353862, "learning_rate": 7.85966287730302e-06, "loss": 0.3165, "step": 2435 }, { "epoch": 2.5777777777777775, "grad_norm": 2.4738622305406635, "learning_rate": 7.840062720501765e-06, "loss": 0.4718, "step": 2436 }, { "epoch": 2.5788359788359787, "grad_norm": 0.2163467306235148, "learning_rate": 7.820462563700511e-06, "loss": 0.3272, "step": 2437 }, { "epoch": 2.57989417989418, "grad_norm": 0.20305099508166402, "learning_rate": 7.800862406899255e-06, "loss": 0.3354, "step": 2438 }, { "epoch": 2.580952380952381, "grad_norm": 0.2251581241069781, "learning_rate": 7.781262250098001e-06, "loss": 0.3525, "step": 2439 }, { "epoch": 2.582010582010582, "grad_norm": 0.20829738004575854, "learning_rate": 7.761662093296747e-06, "loss": 0.357, "step": 2440 }, { "epoch": 2.583068783068783, "grad_norm": 0.20453941398613518, "learning_rate": 7.742061936495493e-06, "loss": 0.3655, "step": 2441 }, { "epoch": 2.5841269841269843, "grad_norm": 0.2084221126285046, "learning_rate": 7.722461779694237e-06, "loss": 0.3545, "step": 2442 }, { "epoch": 2.585185185185185, "grad_norm": 0.20422846872501063, "learning_rate": 7.702861622892983e-06, "loss": 0.3505, "step": 2443 }, { "epoch": 2.5862433862433862, "grad_norm": 0.23322575042300528, "learning_rate": 7.683261466091729e-06, "loss": 0.4038, "step": 2444 }, { "epoch": 2.5873015873015874, "grad_norm": 0.2149540741964415, "learning_rate": 7.663661309290475e-06, "loss": 0.365, "step": 2445 }, { "epoch": 2.588359788359788, "grad_norm": 0.23658255878401824, "learning_rate": 7.64406115248922e-06, "loss": 0.3352, "step": 2446 }, { "epoch": 2.5894179894179894, "grad_norm": 0.193188372100089, "learning_rate": 7.624460995687966e-06, "loss": 0.3533, "step": 2447 }, { "epoch": 2.5904761904761906, "grad_norm": 0.2156461994282791, "learning_rate": 7.604860838886712e-06, "loss": 0.3092, "step": 2448 }, { "epoch": 2.591534391534392, "grad_norm": 0.21459254756654236, "learning_rate": 7.5852606820854575e-06, "loss": 0.371, "step": 2449 }, { "epoch": 2.5925925925925926, "grad_norm": 0.20067238928848508, "learning_rate": 7.565660525284203e-06, "loss": 0.3334, "step": 2450 }, { "epoch": 2.5936507936507938, "grad_norm": 0.20761937713286907, "learning_rate": 7.5460603684829476e-06, "loss": 0.348, "step": 2451 }, { "epoch": 2.594708994708995, "grad_norm": 0.19344185405933195, "learning_rate": 7.5264602116816934e-06, "loss": 0.3343, "step": 2452 }, { "epoch": 2.5957671957671957, "grad_norm": 0.20457097110759648, "learning_rate": 7.506860054880439e-06, "loss": 0.3588, "step": 2453 }, { "epoch": 2.596825396825397, "grad_norm": 0.19256191638601902, "learning_rate": 7.487259898079185e-06, "loss": 0.3044, "step": 2454 }, { "epoch": 2.597883597883598, "grad_norm": 0.19715791865716, "learning_rate": 7.467659741277931e-06, "loss": 0.3438, "step": 2455 }, { "epoch": 2.598941798941799, "grad_norm": 0.20066467071473018, "learning_rate": 7.448059584476676e-06, "loss": 0.3708, "step": 2456 }, { "epoch": 2.6, "grad_norm": 0.2061072533569407, "learning_rate": 7.428459427675422e-06, "loss": 0.365, "step": 2457 }, { "epoch": 2.6010582010582013, "grad_norm": 0.190635839627958, "learning_rate": 7.408859270874168e-06, "loss": 0.3185, "step": 2458 }, { "epoch": 2.602116402116402, "grad_norm": 0.21002157908314345, "learning_rate": 7.389259114072914e-06, "loss": 0.3322, "step": 2459 }, { "epoch": 2.6031746031746033, "grad_norm": 0.18713946271024856, "learning_rate": 7.369658957271658e-06, "loss": 0.3126, "step": 2460 }, { "epoch": 2.6042328042328045, "grad_norm": 0.2132276955579535, "learning_rate": 7.350058800470404e-06, "loss": 0.3714, "step": 2461 }, { "epoch": 2.605291005291005, "grad_norm": 0.1976682334299022, "learning_rate": 7.33045864366915e-06, "loss": 0.3446, "step": 2462 }, { "epoch": 2.6063492063492064, "grad_norm": 0.19144499037515725, "learning_rate": 7.3108584868678955e-06, "loss": 0.334, "step": 2463 }, { "epoch": 2.6074074074074076, "grad_norm": 0.20970237193504582, "learning_rate": 7.291258330066641e-06, "loss": 0.3803, "step": 2464 }, { "epoch": 2.6084656084656084, "grad_norm": 0.19369116393515454, "learning_rate": 7.271658173265386e-06, "loss": 0.2944, "step": 2465 }, { "epoch": 2.6095238095238096, "grad_norm": 0.20464263215117728, "learning_rate": 7.252058016464132e-06, "loss": 0.3118, "step": 2466 }, { "epoch": 2.610582010582011, "grad_norm": 0.19514819457072263, "learning_rate": 7.232457859662878e-06, "loss": 0.3373, "step": 2467 }, { "epoch": 2.6116402116402115, "grad_norm": 0.2054577341252547, "learning_rate": 7.212857702861624e-06, "loss": 0.3312, "step": 2468 }, { "epoch": 2.6126984126984127, "grad_norm": 0.218003406758833, "learning_rate": 7.193257546060368e-06, "loss": 0.3636, "step": 2469 }, { "epoch": 2.613756613756614, "grad_norm": 0.44213851229624174, "learning_rate": 7.173657389259114e-06, "loss": 0.3976, "step": 2470 }, { "epoch": 2.6148148148148147, "grad_norm": 0.2009534741403274, "learning_rate": 7.15405723245786e-06, "loss": 0.3442, "step": 2471 }, { "epoch": 2.615873015873016, "grad_norm": 0.20912608516036513, "learning_rate": 7.134457075656606e-06, "loss": 0.3854, "step": 2472 }, { "epoch": 2.616931216931217, "grad_norm": 0.20078773548602796, "learning_rate": 7.114856918855352e-06, "loss": 0.3793, "step": 2473 }, { "epoch": 2.617989417989418, "grad_norm": 0.21374608990215083, "learning_rate": 7.095256762054097e-06, "loss": 0.3883, "step": 2474 }, { "epoch": 2.619047619047619, "grad_norm": 0.20326986212714018, "learning_rate": 7.075656605252843e-06, "loss": 0.3634, "step": 2475 }, { "epoch": 2.6201058201058203, "grad_norm": 0.184483550265106, "learning_rate": 7.0560564484515885e-06, "loss": 0.3251, "step": 2476 }, { "epoch": 2.621164021164021, "grad_norm": 0.20518627904667391, "learning_rate": 7.036456291650334e-06, "loss": 0.3383, "step": 2477 }, { "epoch": 2.6222222222222222, "grad_norm": 0.2213975218224379, "learning_rate": 7.0168561348490785e-06, "loss": 0.3805, "step": 2478 }, { "epoch": 2.6232804232804234, "grad_norm": 0.20669880976906882, "learning_rate": 6.997255978047824e-06, "loss": 0.3293, "step": 2479 }, { "epoch": 2.624338624338624, "grad_norm": 0.20421330013118466, "learning_rate": 6.97765582124657e-06, "loss": 0.3308, "step": 2480 }, { "epoch": 2.6253968253968254, "grad_norm": 0.23282007125318938, "learning_rate": 6.958055664445316e-06, "loss": 0.3844, "step": 2481 }, { "epoch": 2.6264550264550266, "grad_norm": 0.20766235706851663, "learning_rate": 6.938455507644062e-06, "loss": 0.3729, "step": 2482 }, { "epoch": 2.6275132275132274, "grad_norm": 0.20961331760975993, "learning_rate": 6.918855350842807e-06, "loss": 0.3733, "step": 2483 }, { "epoch": 2.6285714285714286, "grad_norm": 0.20224319032663668, "learning_rate": 6.899255194041553e-06, "loss": 0.3737, "step": 2484 }, { "epoch": 2.6296296296296298, "grad_norm": 0.17940459522018293, "learning_rate": 6.879655037240299e-06, "loss": 0.3172, "step": 2485 }, { "epoch": 2.6306878306878305, "grad_norm": 0.18842546291187015, "learning_rate": 6.860054880439045e-06, "loss": 0.3237, "step": 2486 }, { "epoch": 2.6317460317460317, "grad_norm": 0.20586867202259435, "learning_rate": 6.840454723637789e-06, "loss": 0.3341, "step": 2487 }, { "epoch": 2.632804232804233, "grad_norm": 0.19471736282988128, "learning_rate": 6.820854566836535e-06, "loss": 0.3179, "step": 2488 }, { "epoch": 2.6338624338624337, "grad_norm": 0.20284155346377497, "learning_rate": 6.801254410035281e-06, "loss": 0.3665, "step": 2489 }, { "epoch": 2.634920634920635, "grad_norm": 0.18879946930455782, "learning_rate": 6.7816542532340265e-06, "loss": 0.3259, "step": 2490 }, { "epoch": 2.635978835978836, "grad_norm": 0.19398530466899666, "learning_rate": 6.762054096432771e-06, "loss": 0.3179, "step": 2491 }, { "epoch": 2.637037037037037, "grad_norm": 0.19234318749673635, "learning_rate": 6.742453939631517e-06, "loss": 0.3446, "step": 2492 }, { "epoch": 2.638095238095238, "grad_norm": 0.21354926873262906, "learning_rate": 6.722853782830263e-06, "loss": 0.3525, "step": 2493 }, { "epoch": 2.6391534391534393, "grad_norm": 0.20334093193943195, "learning_rate": 6.703253626029009e-06, "loss": 0.36, "step": 2494 }, { "epoch": 2.64021164021164, "grad_norm": 0.2035547977347391, "learning_rate": 6.683653469227755e-06, "loss": 0.3542, "step": 2495 }, { "epoch": 2.641269841269841, "grad_norm": 0.19627021534313335, "learning_rate": 6.664053312426499e-06, "loss": 0.334, "step": 2496 }, { "epoch": 2.6423280423280424, "grad_norm": 0.2086234488088156, "learning_rate": 6.644453155625245e-06, "loss": 0.3619, "step": 2497 }, { "epoch": 2.643386243386243, "grad_norm": 0.2201861596648691, "learning_rate": 6.624852998823991e-06, "loss": 0.389, "step": 2498 }, { "epoch": 2.6444444444444444, "grad_norm": 0.20846191600277475, "learning_rate": 6.605252842022737e-06, "loss": 0.3779, "step": 2499 }, { "epoch": 2.6455026455026456, "grad_norm": 0.20734012321675435, "learning_rate": 6.585652685221482e-06, "loss": 0.3508, "step": 2500 }, { "epoch": 2.6465608465608463, "grad_norm": 0.19345863392721405, "learning_rate": 6.566052528420228e-06, "loss": 0.3295, "step": 2501 }, { "epoch": 2.6476190476190475, "grad_norm": 0.20354235487098435, "learning_rate": 6.546452371618974e-06, "loss": 0.376, "step": 2502 }, { "epoch": 2.6486772486772487, "grad_norm": 0.19936131307592084, "learning_rate": 6.5268522148177195e-06, "loss": 0.3372, "step": 2503 }, { "epoch": 2.6497354497354495, "grad_norm": 0.19833040616276873, "learning_rate": 6.507252058016465e-06, "loss": 0.3249, "step": 2504 }, { "epoch": 2.6507936507936507, "grad_norm": 0.22382325734125558, "learning_rate": 6.4876519012152095e-06, "loss": 0.3898, "step": 2505 }, { "epoch": 2.651851851851852, "grad_norm": 0.2086391866302971, "learning_rate": 6.468051744413955e-06, "loss": 0.3822, "step": 2506 }, { "epoch": 2.6529100529100527, "grad_norm": 0.19118652570324526, "learning_rate": 6.448451587612701e-06, "loss": 0.3232, "step": 2507 }, { "epoch": 2.653968253968254, "grad_norm": 0.1992916481134372, "learning_rate": 6.428851430811447e-06, "loss": 0.3596, "step": 2508 }, { "epoch": 2.655026455026455, "grad_norm": 0.20052752929381737, "learning_rate": 6.409251274010192e-06, "loss": 0.3474, "step": 2509 }, { "epoch": 2.656084656084656, "grad_norm": 0.1892696179940764, "learning_rate": 6.389651117208938e-06, "loss": 0.3491, "step": 2510 }, { "epoch": 2.657142857142857, "grad_norm": 0.18821467544895704, "learning_rate": 6.370050960407684e-06, "loss": 0.3406, "step": 2511 }, { "epoch": 2.6582010582010582, "grad_norm": 0.2146316899839704, "learning_rate": 6.35045080360643e-06, "loss": 0.3268, "step": 2512 }, { "epoch": 2.659259259259259, "grad_norm": 0.1924428845095873, "learning_rate": 6.330850646805176e-06, "loss": 0.3278, "step": 2513 }, { "epoch": 2.66031746031746, "grad_norm": 0.20147304345647865, "learning_rate": 6.31125049000392e-06, "loss": 0.3452, "step": 2514 }, { "epoch": 2.6613756613756614, "grad_norm": 0.18452957282061885, "learning_rate": 6.291650333202666e-06, "loss": 0.3281, "step": 2515 }, { "epoch": 2.6624338624338626, "grad_norm": 0.1857986422958092, "learning_rate": 6.272050176401412e-06, "loss": 0.3101, "step": 2516 }, { "epoch": 2.6634920634920634, "grad_norm": 0.21536712844728778, "learning_rate": 6.2524500196001575e-06, "loss": 0.3835, "step": 2517 }, { "epoch": 2.6645502645502646, "grad_norm": 0.19196672833526432, "learning_rate": 6.2328498627989025e-06, "loss": 0.331, "step": 2518 }, { "epoch": 2.6656084656084658, "grad_norm": 0.20346805393197445, "learning_rate": 6.213249705997648e-06, "loss": 0.3718, "step": 2519 }, { "epoch": 2.6666666666666665, "grad_norm": 0.19865890016978405, "learning_rate": 6.193649549196394e-06, "loss": 0.3476, "step": 2520 }, { "epoch": 2.6677248677248677, "grad_norm": 0.1868360318396176, "learning_rate": 6.17404939239514e-06, "loss": 0.3461, "step": 2521 }, { "epoch": 2.668783068783069, "grad_norm": 0.18283098776282525, "learning_rate": 6.154449235593885e-06, "loss": 0.3185, "step": 2522 }, { "epoch": 2.6698412698412697, "grad_norm": 0.19611612634805634, "learning_rate": 6.134849078792631e-06, "loss": 0.3348, "step": 2523 }, { "epoch": 2.670899470899471, "grad_norm": 0.19050047780185728, "learning_rate": 6.115248921991376e-06, "loss": 0.3277, "step": 2524 }, { "epoch": 2.671957671957672, "grad_norm": 0.2084510520678467, "learning_rate": 6.095648765190122e-06, "loss": 0.3225, "step": 2525 }, { "epoch": 2.6730158730158733, "grad_norm": 0.1922454758562699, "learning_rate": 6.076048608388867e-06, "loss": 0.3602, "step": 2526 }, { "epoch": 2.674074074074074, "grad_norm": 0.19098409079480982, "learning_rate": 6.056448451587613e-06, "loss": 0.3501, "step": 2527 }, { "epoch": 2.6751322751322753, "grad_norm": 0.19240132190670328, "learning_rate": 6.036848294786359e-06, "loss": 0.3455, "step": 2528 }, { "epoch": 2.6761904761904765, "grad_norm": 0.18234297191193383, "learning_rate": 6.017248137985105e-06, "loss": 0.3248, "step": 2529 }, { "epoch": 2.677248677248677, "grad_norm": 0.18076554021932248, "learning_rate": 5.99764798118385e-06, "loss": 0.3065, "step": 2530 }, { "epoch": 2.6783068783068784, "grad_norm": 0.1929979054056131, "learning_rate": 5.9780478243825955e-06, "loss": 0.3604, "step": 2531 }, { "epoch": 2.6793650793650796, "grad_norm": 0.20221424529505985, "learning_rate": 5.958447667581341e-06, "loss": 0.3857, "step": 2532 }, { "epoch": 2.6804232804232804, "grad_norm": 0.20020493436720654, "learning_rate": 5.938847510780086e-06, "loss": 0.3353, "step": 2533 }, { "epoch": 2.6814814814814816, "grad_norm": 0.1979878253122416, "learning_rate": 5.919247353978832e-06, "loss": 0.3524, "step": 2534 }, { "epoch": 2.682539682539683, "grad_norm": 0.1878994433039551, "learning_rate": 5.899647197177577e-06, "loss": 0.3234, "step": 2535 }, { "epoch": 2.6835978835978835, "grad_norm": 0.20303922395745672, "learning_rate": 5.880047040376323e-06, "loss": 0.3472, "step": 2536 }, { "epoch": 2.6846560846560847, "grad_norm": 0.1964831514888397, "learning_rate": 5.860446883575069e-06, "loss": 0.3512, "step": 2537 }, { "epoch": 2.685714285714286, "grad_norm": 0.20050158418913078, "learning_rate": 5.840846726773815e-06, "loss": 0.368, "step": 2538 }, { "epoch": 2.6867724867724867, "grad_norm": 0.19809108859289698, "learning_rate": 5.82124656997256e-06, "loss": 0.3642, "step": 2539 }, { "epoch": 2.687830687830688, "grad_norm": 0.19542084982885585, "learning_rate": 5.801646413171306e-06, "loss": 0.3505, "step": 2540 }, { "epoch": 2.688888888888889, "grad_norm": 0.25026144659287575, "learning_rate": 5.782046256370052e-06, "loss": 0.3564, "step": 2541 }, { "epoch": 2.68994708994709, "grad_norm": 0.19368060067900453, "learning_rate": 5.762446099568797e-06, "loss": 0.3444, "step": 2542 }, { "epoch": 2.691005291005291, "grad_norm": 0.2015994007513886, "learning_rate": 5.742845942767543e-06, "loss": 0.3331, "step": 2543 }, { "epoch": 2.6920634920634923, "grad_norm": 0.20202195926051253, "learning_rate": 5.723245785966288e-06, "loss": 0.3452, "step": 2544 }, { "epoch": 2.693121693121693, "grad_norm": 0.18149608421071714, "learning_rate": 5.7036456291650335e-06, "loss": 0.2989, "step": 2545 }, { "epoch": 2.6941798941798942, "grad_norm": 0.21151146883978197, "learning_rate": 5.684045472363779e-06, "loss": 0.3899, "step": 2546 }, { "epoch": 2.6952380952380954, "grad_norm": 0.19106657712419947, "learning_rate": 5.664445315562525e-06, "loss": 0.3133, "step": 2547 }, { "epoch": 2.696296296296296, "grad_norm": 0.204486097348059, "learning_rate": 5.64484515876127e-06, "loss": 0.3243, "step": 2548 }, { "epoch": 2.6973544973544974, "grad_norm": 0.2120307771160055, "learning_rate": 5.625245001960016e-06, "loss": 0.3616, "step": 2549 }, { "epoch": 2.6984126984126986, "grad_norm": 0.17095776864486206, "learning_rate": 5.605644845158761e-06, "loss": 0.2899, "step": 2550 }, { "epoch": 2.6994708994708994, "grad_norm": 0.21843454968847806, "learning_rate": 5.586044688357507e-06, "loss": 0.3463, "step": 2551 }, { "epoch": 2.7005291005291006, "grad_norm": 0.19118514569525785, "learning_rate": 5.566444531556253e-06, "loss": 0.3216, "step": 2552 }, { "epoch": 2.7015873015873018, "grad_norm": 0.25238654920190934, "learning_rate": 5.546844374754998e-06, "loss": 0.3795, "step": 2553 }, { "epoch": 2.7026455026455025, "grad_norm": 0.20643432168257772, "learning_rate": 5.527244217953744e-06, "loss": 0.366, "step": 2554 }, { "epoch": 2.7037037037037037, "grad_norm": 0.1951728012706216, "learning_rate": 5.50764406115249e-06, "loss": 0.3211, "step": 2555 }, { "epoch": 2.704761904761905, "grad_norm": 0.19406462105051225, "learning_rate": 5.488043904351236e-06, "loss": 0.3226, "step": 2556 }, { "epoch": 2.7058201058201057, "grad_norm": 0.19140790364896687, "learning_rate": 5.468443747549981e-06, "loss": 0.32, "step": 2557 }, { "epoch": 2.706878306878307, "grad_norm": 0.19071360052522665, "learning_rate": 5.4488435907487265e-06, "loss": 0.3619, "step": 2558 }, { "epoch": 2.707936507936508, "grad_norm": 0.2066029798654874, "learning_rate": 5.4292434339474715e-06, "loss": 0.37, "step": 2559 }, { "epoch": 2.708994708994709, "grad_norm": 0.2000206802025288, "learning_rate": 5.409643277146217e-06, "loss": 0.3739, "step": 2560 }, { "epoch": 2.71005291005291, "grad_norm": 0.1992451517241142, "learning_rate": 5.390043120344963e-06, "loss": 0.2976, "step": 2561 }, { "epoch": 2.7111111111111112, "grad_norm": 0.21041064516580602, "learning_rate": 5.370442963543708e-06, "loss": 0.3372, "step": 2562 }, { "epoch": 2.712169312169312, "grad_norm": 0.1965382228229818, "learning_rate": 5.350842806742454e-06, "loss": 0.3181, "step": 2563 }, { "epoch": 2.713227513227513, "grad_norm": 0.19553975072971144, "learning_rate": 5.3312426499412e-06, "loss": 0.3334, "step": 2564 }, { "epoch": 2.7142857142857144, "grad_norm": 0.182293586725453, "learning_rate": 5.311642493139946e-06, "loss": 0.312, "step": 2565 }, { "epoch": 2.715343915343915, "grad_norm": 0.22281231749006977, "learning_rate": 5.292042336338691e-06, "loss": 0.4265, "step": 2566 }, { "epoch": 2.7164021164021164, "grad_norm": 0.19852030519721917, "learning_rate": 5.272442179537437e-06, "loss": 0.3736, "step": 2567 }, { "epoch": 2.7174603174603176, "grad_norm": 0.20060353048459667, "learning_rate": 5.252842022736182e-06, "loss": 0.3608, "step": 2568 }, { "epoch": 2.7185185185185183, "grad_norm": 0.2043124213401307, "learning_rate": 5.233241865934928e-06, "loss": 0.3695, "step": 2569 }, { "epoch": 2.7195767195767195, "grad_norm": 0.21279444075120416, "learning_rate": 5.213641709133673e-06, "loss": 0.3461, "step": 2570 }, { "epoch": 2.7206349206349207, "grad_norm": 0.1962353653907673, "learning_rate": 5.194041552332419e-06, "loss": 0.3479, "step": 2571 }, { "epoch": 2.7216931216931215, "grad_norm": 0.19136776040919806, "learning_rate": 5.1744413955311645e-06, "loss": 0.3381, "step": 2572 }, { "epoch": 2.7227513227513227, "grad_norm": 0.19168537594384008, "learning_rate": 5.15484123872991e-06, "loss": 0.332, "step": 2573 }, { "epoch": 2.723809523809524, "grad_norm": 0.18822366734415621, "learning_rate": 5.135241081928656e-06, "loss": 0.3287, "step": 2574 }, { "epoch": 2.7248677248677247, "grad_norm": 0.2077200076431067, "learning_rate": 5.115640925127401e-06, "loss": 0.3431, "step": 2575 }, { "epoch": 2.725925925925926, "grad_norm": 0.19269703180103093, "learning_rate": 5.096040768326147e-06, "loss": 0.3086, "step": 2576 }, { "epoch": 2.726984126984127, "grad_norm": 0.18860716399222538, "learning_rate": 5.076440611524892e-06, "loss": 0.3154, "step": 2577 }, { "epoch": 2.728042328042328, "grad_norm": 0.1958899798494481, "learning_rate": 5.056840454723638e-06, "loss": 0.3111, "step": 2578 }, { "epoch": 2.729100529100529, "grad_norm": 0.1820101679127598, "learning_rate": 5.037240297922383e-06, "loss": 0.3145, "step": 2579 }, { "epoch": 2.7301587301587302, "grad_norm": 0.1983802921074873, "learning_rate": 5.017640141121129e-06, "loss": 0.3595, "step": 2580 }, { "epoch": 2.731216931216931, "grad_norm": 0.19970794198008765, "learning_rate": 4.998039984319875e-06, "loss": 0.3572, "step": 2581 }, { "epoch": 2.732275132275132, "grad_norm": 0.1893406404078852, "learning_rate": 4.978439827518621e-06, "loss": 0.3426, "step": 2582 }, { "epoch": 2.7333333333333334, "grad_norm": 0.20404466163446328, "learning_rate": 4.9588396707173666e-06, "loss": 0.3658, "step": 2583 }, { "epoch": 2.734391534391534, "grad_norm": 0.18412114768156596, "learning_rate": 4.939239513916112e-06, "loss": 0.3046, "step": 2584 }, { "epoch": 2.7354497354497354, "grad_norm": 0.2075384994168872, "learning_rate": 4.9196393571148575e-06, "loss": 0.338, "step": 2585 }, { "epoch": 2.7365079365079366, "grad_norm": 0.22412000158196252, "learning_rate": 4.9000392003136025e-06, "loss": 0.4155, "step": 2586 }, { "epoch": 2.7375661375661373, "grad_norm": 0.20401992100764668, "learning_rate": 4.880439043512348e-06, "loss": 0.3865, "step": 2587 }, { "epoch": 2.7386243386243385, "grad_norm": 0.18604920498095107, "learning_rate": 4.860838886711093e-06, "loss": 0.3134, "step": 2588 }, { "epoch": 2.7396825396825397, "grad_norm": 0.21195152122045988, "learning_rate": 4.841238729909839e-06, "loss": 0.3833, "step": 2589 }, { "epoch": 2.7407407407407405, "grad_norm": 0.19793521415832854, "learning_rate": 4.821638573108585e-06, "loss": 0.3654, "step": 2590 }, { "epoch": 2.7417989417989417, "grad_norm": 0.19239288309471586, "learning_rate": 4.802038416307331e-06, "loss": 0.3311, "step": 2591 }, { "epoch": 2.742857142857143, "grad_norm": 0.18478342433128606, "learning_rate": 4.782438259506077e-06, "loss": 0.3196, "step": 2592 }, { "epoch": 2.7439153439153436, "grad_norm": 0.19737300368776284, "learning_rate": 4.762838102704822e-06, "loss": 0.3399, "step": 2593 }, { "epoch": 2.744973544973545, "grad_norm": 0.2044197261835309, "learning_rate": 4.743237945903568e-06, "loss": 0.3874, "step": 2594 }, { "epoch": 2.746031746031746, "grad_norm": 0.2160010979562428, "learning_rate": 4.723637789102313e-06, "loss": 0.385, "step": 2595 }, { "epoch": 2.7470899470899472, "grad_norm": 0.18127732926658155, "learning_rate": 4.704037632301059e-06, "loss": 0.3192, "step": 2596 }, { "epoch": 2.748148148148148, "grad_norm": 0.18852738977755046, "learning_rate": 4.684437475499804e-06, "loss": 0.3439, "step": 2597 }, { "epoch": 2.749206349206349, "grad_norm": 0.17640615561096873, "learning_rate": 4.66483731869855e-06, "loss": 0.2992, "step": 2598 }, { "epoch": 2.7502645502645504, "grad_norm": 0.20969394677698452, "learning_rate": 4.6452371618972955e-06, "loss": 0.4039, "step": 2599 }, { "epoch": 2.751322751322751, "grad_norm": 0.1938761616618117, "learning_rate": 4.625637005096041e-06, "loss": 0.3456, "step": 2600 }, { "epoch": 2.7523809523809524, "grad_norm": 0.20065804261742967, "learning_rate": 4.606036848294787e-06, "loss": 0.379, "step": 2601 }, { "epoch": 2.7534391534391536, "grad_norm": 0.18022036699122285, "learning_rate": 4.586436691493532e-06, "loss": 0.2838, "step": 2602 }, { "epoch": 2.7544973544973543, "grad_norm": 0.20021693200514237, "learning_rate": 4.566836534692278e-06, "loss": 0.3744, "step": 2603 }, { "epoch": 2.7555555555555555, "grad_norm": 0.19141878650651184, "learning_rate": 4.547236377891023e-06, "loss": 0.2937, "step": 2604 }, { "epoch": 2.7566137566137567, "grad_norm": 0.2529365210436403, "learning_rate": 4.527636221089769e-06, "loss": 0.3638, "step": 2605 }, { "epoch": 2.757671957671958, "grad_norm": 0.1854468401388474, "learning_rate": 4.508036064288514e-06, "loss": 0.3123, "step": 2606 }, { "epoch": 2.7587301587301587, "grad_norm": 0.20565208226029344, "learning_rate": 4.48843590748726e-06, "loss": 0.3706, "step": 2607 }, { "epoch": 2.75978835978836, "grad_norm": 0.19044667922854414, "learning_rate": 4.468835750686006e-06, "loss": 0.3196, "step": 2608 }, { "epoch": 2.760846560846561, "grad_norm": 0.19238853879054066, "learning_rate": 4.449235593884752e-06, "loss": 0.3148, "step": 2609 }, { "epoch": 2.761904761904762, "grad_norm": 0.1956487628537959, "learning_rate": 4.4296354370834976e-06, "loss": 0.3521, "step": 2610 }, { "epoch": 2.762962962962963, "grad_norm": 0.20209286152850098, "learning_rate": 4.410035280282243e-06, "loss": 0.3717, "step": 2611 }, { "epoch": 2.7640211640211643, "grad_norm": 0.19340479563705504, "learning_rate": 4.3904351234809885e-06, "loss": 0.3598, "step": 2612 }, { "epoch": 2.765079365079365, "grad_norm": 0.19526534766192782, "learning_rate": 4.3708349666797335e-06, "loss": 0.3709, "step": 2613 }, { "epoch": 2.7661375661375662, "grad_norm": 0.1876434539839283, "learning_rate": 4.351234809878479e-06, "loss": 0.3356, "step": 2614 }, { "epoch": 2.7671957671957674, "grad_norm": 0.18913882028697904, "learning_rate": 4.331634653077224e-06, "loss": 0.3331, "step": 2615 }, { "epoch": 2.768253968253968, "grad_norm": 0.18531612005736103, "learning_rate": 4.31203449627597e-06, "loss": 0.3121, "step": 2616 }, { "epoch": 2.7693121693121694, "grad_norm": 0.18210121950128233, "learning_rate": 4.292434339474716e-06, "loss": 0.3217, "step": 2617 }, { "epoch": 2.7703703703703706, "grad_norm": 0.17631791372733657, "learning_rate": 4.272834182673462e-06, "loss": 0.3145, "step": 2618 }, { "epoch": 2.7714285714285714, "grad_norm": 0.2052048552010276, "learning_rate": 4.253234025872207e-06, "loss": 0.3445, "step": 2619 }, { "epoch": 2.7724867724867726, "grad_norm": 0.20391385871973938, "learning_rate": 4.233633869070953e-06, "loss": 0.3821, "step": 2620 }, { "epoch": 2.7735449735449738, "grad_norm": 0.19379016138412378, "learning_rate": 4.214033712269699e-06, "loss": 0.3808, "step": 2621 }, { "epoch": 2.7746031746031745, "grad_norm": 0.18659539827425273, "learning_rate": 4.194433555468444e-06, "loss": 0.3072, "step": 2622 }, { "epoch": 2.7756613756613757, "grad_norm": 0.1894212818459714, "learning_rate": 4.17483339866719e-06, "loss": 0.3268, "step": 2623 }, { "epoch": 2.776719576719577, "grad_norm": 0.190978704730525, "learning_rate": 4.155233241865935e-06, "loss": 0.3382, "step": 2624 }, { "epoch": 2.7777777777777777, "grad_norm": 0.20109476739966778, "learning_rate": 4.135633085064681e-06, "loss": 0.3827, "step": 2625 }, { "epoch": 2.778835978835979, "grad_norm": 0.19363672859817238, "learning_rate": 4.1160329282634265e-06, "loss": 0.3587, "step": 2626 }, { "epoch": 2.77989417989418, "grad_norm": 0.2010695366446098, "learning_rate": 4.096432771462172e-06, "loss": 0.3657, "step": 2627 }, { "epoch": 2.780952380952381, "grad_norm": 0.1910110293287592, "learning_rate": 4.076832614660917e-06, "loss": 0.3155, "step": 2628 }, { "epoch": 2.782010582010582, "grad_norm": 0.19681762572714664, "learning_rate": 4.057232457859663e-06, "loss": 0.3781, "step": 2629 }, { "epoch": 2.7830687830687832, "grad_norm": 0.2038735269817466, "learning_rate": 4.037632301058409e-06, "loss": 0.3807, "step": 2630 }, { "epoch": 2.784126984126984, "grad_norm": 0.19777937024940856, "learning_rate": 4.018032144257154e-06, "loss": 0.3569, "step": 2631 }, { "epoch": 2.785185185185185, "grad_norm": 0.1995201890239187, "learning_rate": 3.9984319874559e-06, "loss": 0.3701, "step": 2632 }, { "epoch": 2.7862433862433864, "grad_norm": 0.20317171069691647, "learning_rate": 3.978831830654645e-06, "loss": 0.3569, "step": 2633 }, { "epoch": 2.787301587301587, "grad_norm": 0.19068734602892104, "learning_rate": 3.959231673853391e-06, "loss": 0.3366, "step": 2634 }, { "epoch": 2.7883597883597884, "grad_norm": 0.1828626835621772, "learning_rate": 3.939631517052137e-06, "loss": 0.3116, "step": 2635 }, { "epoch": 2.7894179894179896, "grad_norm": 0.19548905750250636, "learning_rate": 3.920031360250883e-06, "loss": 0.3472, "step": 2636 }, { "epoch": 2.7904761904761903, "grad_norm": 0.19023591210044766, "learning_rate": 3.900431203449628e-06, "loss": 0.3046, "step": 2637 }, { "epoch": 2.7915343915343915, "grad_norm": 0.24034345295992002, "learning_rate": 3.880831046648374e-06, "loss": 0.402, "step": 2638 }, { "epoch": 2.7925925925925927, "grad_norm": 0.1934131985141875, "learning_rate": 3.861230889847119e-06, "loss": 0.3431, "step": 2639 }, { "epoch": 2.7936507936507935, "grad_norm": 0.19510076751913827, "learning_rate": 3.8416307330458645e-06, "loss": 0.3473, "step": 2640 }, { "epoch": 2.7947089947089947, "grad_norm": 0.20150653464491855, "learning_rate": 3.82203057624461e-06, "loss": 0.3688, "step": 2641 }, { "epoch": 2.795767195767196, "grad_norm": 0.18941533891094486, "learning_rate": 3.802430419443356e-06, "loss": 0.3482, "step": 2642 }, { "epoch": 2.7968253968253967, "grad_norm": 0.20133005930916215, "learning_rate": 3.7828302626421017e-06, "loss": 0.3549, "step": 2643 }, { "epoch": 2.797883597883598, "grad_norm": 0.20525961667412126, "learning_rate": 3.7632301058408467e-06, "loss": 0.3616, "step": 2644 }, { "epoch": 2.798941798941799, "grad_norm": 0.19156388319779546, "learning_rate": 3.7436299490395926e-06, "loss": 0.3417, "step": 2645 }, { "epoch": 2.8, "grad_norm": 0.2011256721429364, "learning_rate": 3.724029792238338e-06, "loss": 0.3601, "step": 2646 }, { "epoch": 2.801058201058201, "grad_norm": 0.19619574219995886, "learning_rate": 3.704429635437084e-06, "loss": 0.3485, "step": 2647 }, { "epoch": 2.8021164021164022, "grad_norm": 0.19499163232482158, "learning_rate": 3.684829478635829e-06, "loss": 0.3725, "step": 2648 }, { "epoch": 2.803174603174603, "grad_norm": 0.19294474196059933, "learning_rate": 3.665229321834575e-06, "loss": 0.3555, "step": 2649 }, { "epoch": 2.804232804232804, "grad_norm": 0.2032516666422444, "learning_rate": 3.6456291650333207e-06, "loss": 0.3661, "step": 2650 }, { "epoch": 2.8052910052910054, "grad_norm": 0.2020734831226216, "learning_rate": 3.626029008232066e-06, "loss": 0.3556, "step": 2651 }, { "epoch": 2.806349206349206, "grad_norm": 0.1916619404390491, "learning_rate": 3.606428851430812e-06, "loss": 0.3329, "step": 2652 }, { "epoch": 2.8074074074074074, "grad_norm": 0.1952962673766634, "learning_rate": 3.586828694629557e-06, "loss": 0.3554, "step": 2653 }, { "epoch": 2.8084656084656086, "grad_norm": 0.1962712621825182, "learning_rate": 3.567228537828303e-06, "loss": 0.3574, "step": 2654 }, { "epoch": 2.8095238095238093, "grad_norm": 0.1844963945964023, "learning_rate": 3.5476283810270484e-06, "loss": 0.3231, "step": 2655 }, { "epoch": 2.8105820105820105, "grad_norm": 0.18348101787536167, "learning_rate": 3.5280282242257942e-06, "loss": 0.3159, "step": 2656 }, { "epoch": 2.8116402116402117, "grad_norm": 0.1849348467642649, "learning_rate": 3.5084280674245393e-06, "loss": 0.3231, "step": 2657 }, { "epoch": 2.8126984126984125, "grad_norm": 0.18599313201602283, "learning_rate": 3.488827910623285e-06, "loss": 0.3381, "step": 2658 }, { "epoch": 2.8137566137566137, "grad_norm": 0.18221416638152477, "learning_rate": 3.469227753822031e-06, "loss": 0.3136, "step": 2659 }, { "epoch": 2.814814814814815, "grad_norm": 0.18255639179737698, "learning_rate": 3.4496275970207765e-06, "loss": 0.322, "step": 2660 }, { "epoch": 2.8158730158730156, "grad_norm": 0.2095837906111075, "learning_rate": 3.4300274402195223e-06, "loss": 0.3377, "step": 2661 }, { "epoch": 2.816931216931217, "grad_norm": 0.18665268042560523, "learning_rate": 3.4104272834182674e-06, "loss": 0.3431, "step": 2662 }, { "epoch": 2.817989417989418, "grad_norm": 0.19002223272665303, "learning_rate": 3.3908271266170132e-06, "loss": 0.3342, "step": 2663 }, { "epoch": 2.819047619047619, "grad_norm": 0.18705897801580285, "learning_rate": 3.3712269698157587e-06, "loss": 0.3181, "step": 2664 }, { "epoch": 2.82010582010582, "grad_norm": 0.20862688975705004, "learning_rate": 3.3516268130145046e-06, "loss": 0.3668, "step": 2665 }, { "epoch": 2.821164021164021, "grad_norm": 0.19225010946678575, "learning_rate": 3.3320266562132496e-06, "loss": 0.3359, "step": 2666 }, { "epoch": 2.822222222222222, "grad_norm": 0.20161732029347787, "learning_rate": 3.3124264994119955e-06, "loss": 0.3367, "step": 2667 }, { "epoch": 2.823280423280423, "grad_norm": 0.18828704734970103, "learning_rate": 3.292826342610741e-06, "loss": 0.3094, "step": 2668 }, { "epoch": 2.8243386243386244, "grad_norm": 0.18924625950342489, "learning_rate": 3.273226185809487e-06, "loss": 0.3434, "step": 2669 }, { "epoch": 2.825396825396825, "grad_norm": 0.18417340787076836, "learning_rate": 3.2536260290082327e-06, "loss": 0.3303, "step": 2670 }, { "epoch": 2.8264550264550263, "grad_norm": 0.20379631500237214, "learning_rate": 3.2340258722069777e-06, "loss": 0.3635, "step": 2671 }, { "epoch": 2.8275132275132275, "grad_norm": 0.19838887354654922, "learning_rate": 3.2144257154057236e-06, "loss": 0.3639, "step": 2672 }, { "epoch": 2.8285714285714287, "grad_norm": 0.19794810913649097, "learning_rate": 3.194825558604469e-06, "loss": 0.3438, "step": 2673 }, { "epoch": 2.8296296296296295, "grad_norm": 0.185510630825901, "learning_rate": 3.175225401803215e-06, "loss": 0.3467, "step": 2674 }, { "epoch": 2.8306878306878307, "grad_norm": 0.192065774599022, "learning_rate": 3.15562524500196e-06, "loss": 0.3375, "step": 2675 }, { "epoch": 2.831746031746032, "grad_norm": 0.19435756312714894, "learning_rate": 3.136025088200706e-06, "loss": 0.3493, "step": 2676 }, { "epoch": 2.8328042328042327, "grad_norm": 0.2009853526245991, "learning_rate": 3.1164249313994513e-06, "loss": 0.3707, "step": 2677 }, { "epoch": 2.833862433862434, "grad_norm": 0.1744316971987279, "learning_rate": 3.096824774598197e-06, "loss": 0.2686, "step": 2678 }, { "epoch": 2.834920634920635, "grad_norm": 0.19758049236544187, "learning_rate": 3.0772246177969426e-06, "loss": 0.3461, "step": 2679 }, { "epoch": 2.835978835978836, "grad_norm": 0.19599350952580494, "learning_rate": 3.057624460995688e-06, "loss": 0.3551, "step": 2680 }, { "epoch": 2.837037037037037, "grad_norm": 0.18930967657748698, "learning_rate": 3.0380243041944335e-06, "loss": 0.3696, "step": 2681 }, { "epoch": 2.8380952380952382, "grad_norm": 0.20202839534587627, "learning_rate": 3.0184241473931794e-06, "loss": 0.3899, "step": 2682 }, { "epoch": 2.8391534391534394, "grad_norm": 0.1889193752419766, "learning_rate": 2.998823990591925e-06, "loss": 0.3159, "step": 2683 }, { "epoch": 2.84021164021164, "grad_norm": 0.1911231246544652, "learning_rate": 2.9792238337906707e-06, "loss": 0.3272, "step": 2684 }, { "epoch": 2.8412698412698414, "grad_norm": 0.20056244525036598, "learning_rate": 2.959623676989416e-06, "loss": 0.3516, "step": 2685 }, { "epoch": 2.8423280423280426, "grad_norm": 0.19474804822394667, "learning_rate": 2.9400235201881616e-06, "loss": 0.3619, "step": 2686 }, { "epoch": 2.8433862433862434, "grad_norm": 0.1922225375564539, "learning_rate": 2.9204233633869075e-06, "loss": 0.3583, "step": 2687 }, { "epoch": 2.8444444444444446, "grad_norm": 0.2114409910111819, "learning_rate": 2.900823206585653e-06, "loss": 0.3864, "step": 2688 }, { "epoch": 2.8455026455026458, "grad_norm": 0.18768776804906878, "learning_rate": 2.8812230497843984e-06, "loss": 0.3316, "step": 2689 }, { "epoch": 2.8465608465608465, "grad_norm": 0.19630860306480566, "learning_rate": 2.861622892983144e-06, "loss": 0.3418, "step": 2690 }, { "epoch": 2.8476190476190477, "grad_norm": 0.18278700681759866, "learning_rate": 2.8420227361818897e-06, "loss": 0.3305, "step": 2691 }, { "epoch": 2.848677248677249, "grad_norm": 0.1828410519755495, "learning_rate": 2.822422579380635e-06, "loss": 0.3093, "step": 2692 }, { "epoch": 2.8497354497354497, "grad_norm": 0.19191081583592245, "learning_rate": 2.8028224225793806e-06, "loss": 0.3178, "step": 2693 }, { "epoch": 2.850793650793651, "grad_norm": 0.19137029288992774, "learning_rate": 2.7832222657781265e-06, "loss": 0.3494, "step": 2694 }, { "epoch": 2.851851851851852, "grad_norm": 0.18943110594155188, "learning_rate": 2.763622108976872e-06, "loss": 0.346, "step": 2695 }, { "epoch": 2.852910052910053, "grad_norm": 0.20066459093560637, "learning_rate": 2.744021952175618e-06, "loss": 0.3665, "step": 2696 }, { "epoch": 2.853968253968254, "grad_norm": 0.20097973469627337, "learning_rate": 2.7244217953743632e-06, "loss": 0.3853, "step": 2697 }, { "epoch": 2.8550264550264552, "grad_norm": 0.19008839219881032, "learning_rate": 2.7048216385731087e-06, "loss": 0.346, "step": 2698 }, { "epoch": 2.856084656084656, "grad_norm": 0.17411703036920584, "learning_rate": 2.685221481771854e-06, "loss": 0.3064, "step": 2699 }, { "epoch": 2.857142857142857, "grad_norm": 0.19049701322996757, "learning_rate": 2.6656213249706e-06, "loss": 0.3472, "step": 2700 }, { "epoch": 2.8582010582010584, "grad_norm": 0.17636675029171775, "learning_rate": 2.6460211681693455e-06, "loss": 0.3013, "step": 2701 }, { "epoch": 2.859259259259259, "grad_norm": 0.18713094779664108, "learning_rate": 2.626421011368091e-06, "loss": 0.332, "step": 2702 }, { "epoch": 2.8603174603174604, "grad_norm": 0.1840880400812757, "learning_rate": 2.6068208545668364e-06, "loss": 0.3173, "step": 2703 }, { "epoch": 2.8613756613756616, "grad_norm": 0.18365151688406114, "learning_rate": 2.5872206977655822e-06, "loss": 0.3351, "step": 2704 }, { "epoch": 2.8624338624338623, "grad_norm": 0.19835551815444752, "learning_rate": 2.567620540964328e-06, "loss": 0.3591, "step": 2705 }, { "epoch": 2.8634920634920635, "grad_norm": 0.1821324213120818, "learning_rate": 2.5480203841630736e-06, "loss": 0.3202, "step": 2706 }, { "epoch": 2.8645502645502647, "grad_norm": 0.18706640843666805, "learning_rate": 2.528420227361819e-06, "loss": 0.3202, "step": 2707 }, { "epoch": 2.8656084656084655, "grad_norm": 0.1890910955252759, "learning_rate": 2.5088200705605645e-06, "loss": 0.3466, "step": 2708 }, { "epoch": 2.8666666666666667, "grad_norm": 0.19715376592862416, "learning_rate": 2.4892199137593104e-06, "loss": 0.3587, "step": 2709 }, { "epoch": 2.867724867724868, "grad_norm": 0.1929758625937477, "learning_rate": 2.469619756958056e-06, "loss": 0.3364, "step": 2710 }, { "epoch": 2.8687830687830687, "grad_norm": 0.19274028108990232, "learning_rate": 2.4500196001568013e-06, "loss": 0.3349, "step": 2711 }, { "epoch": 2.86984126984127, "grad_norm": 0.18700906272661122, "learning_rate": 2.4304194433555467e-06, "loss": 0.3221, "step": 2712 }, { "epoch": 2.870899470899471, "grad_norm": 0.1815493059168304, "learning_rate": 2.4108192865542926e-06, "loss": 0.3283, "step": 2713 }, { "epoch": 2.871957671957672, "grad_norm": 0.19775169552678987, "learning_rate": 2.3912191297530385e-06, "loss": 0.3233, "step": 2714 }, { "epoch": 2.873015873015873, "grad_norm": 0.2659234152993213, "learning_rate": 2.371618972951784e-06, "loss": 0.3772, "step": 2715 }, { "epoch": 2.8740740740740742, "grad_norm": 0.2176181455659064, "learning_rate": 2.3520188161505294e-06, "loss": 0.3911, "step": 2716 }, { "epoch": 2.875132275132275, "grad_norm": 0.17472137810214414, "learning_rate": 2.332418659349275e-06, "loss": 0.2846, "step": 2717 }, { "epoch": 2.876190476190476, "grad_norm": 0.19723914254783917, "learning_rate": 2.3128185025480207e-06, "loss": 0.3504, "step": 2718 }, { "epoch": 2.8772486772486774, "grad_norm": 0.18663011406982238, "learning_rate": 2.293218345746766e-06, "loss": 0.3262, "step": 2719 }, { "epoch": 2.878306878306878, "grad_norm": 0.19137413050629865, "learning_rate": 2.2736181889455116e-06, "loss": 0.3648, "step": 2720 }, { "epoch": 2.8793650793650793, "grad_norm": 0.19471470387541595, "learning_rate": 2.254018032144257e-06, "loss": 0.3447, "step": 2721 }, { "epoch": 2.8804232804232806, "grad_norm": 0.18568099963810475, "learning_rate": 2.234417875343003e-06, "loss": 0.3104, "step": 2722 }, { "epoch": 2.8814814814814813, "grad_norm": 0.17907029988352577, "learning_rate": 2.2148177185417488e-06, "loss": 0.3088, "step": 2723 }, { "epoch": 2.8825396825396825, "grad_norm": 0.18879588257488686, "learning_rate": 2.1952175617404942e-06, "loss": 0.362, "step": 2724 }, { "epoch": 2.8835978835978837, "grad_norm": 0.17652030997033405, "learning_rate": 2.1756174049392397e-06, "loss": 0.3012, "step": 2725 }, { "epoch": 2.8846560846560845, "grad_norm": 0.1763918711495294, "learning_rate": 2.156017248137985e-06, "loss": 0.2995, "step": 2726 }, { "epoch": 2.8857142857142857, "grad_norm": 0.2041375473388221, "learning_rate": 2.136417091336731e-06, "loss": 0.3903, "step": 2727 }, { "epoch": 2.886772486772487, "grad_norm": 0.1799297516998756, "learning_rate": 2.1168169345354765e-06, "loss": 0.3152, "step": 2728 }, { "epoch": 2.8878306878306876, "grad_norm": 0.1995448891297022, "learning_rate": 2.097216777734222e-06, "loss": 0.3903, "step": 2729 }, { "epoch": 2.888888888888889, "grad_norm": 0.19215679672923563, "learning_rate": 2.0776166209329674e-06, "loss": 0.3628, "step": 2730 }, { "epoch": 2.88994708994709, "grad_norm": 0.18610530187878963, "learning_rate": 2.0580164641317132e-06, "loss": 0.3251, "step": 2731 }, { "epoch": 2.891005291005291, "grad_norm": 0.17806080573632393, "learning_rate": 2.0384163073304587e-06, "loss": 0.3056, "step": 2732 }, { "epoch": 2.892063492063492, "grad_norm": 0.18795661792601714, "learning_rate": 2.0188161505292046e-06, "loss": 0.3396, "step": 2733 }, { "epoch": 2.893121693121693, "grad_norm": 0.18063664445735603, "learning_rate": 1.99921599372795e-06, "loss": 0.2997, "step": 2734 }, { "epoch": 2.894179894179894, "grad_norm": 0.18511841091215842, "learning_rate": 1.9796158369266955e-06, "loss": 0.3388, "step": 2735 }, { "epoch": 2.895238095238095, "grad_norm": 0.2003527156439393, "learning_rate": 1.9600156801254413e-06, "loss": 0.3554, "step": 2736 }, { "epoch": 2.8962962962962964, "grad_norm": 0.19842421312034014, "learning_rate": 1.940415523324187e-06, "loss": 0.3761, "step": 2737 }, { "epoch": 2.897354497354497, "grad_norm": 0.2742847809137376, "learning_rate": 1.9208153665229322e-06, "loss": 0.374, "step": 2738 }, { "epoch": 2.8984126984126983, "grad_norm": 0.19759325790955387, "learning_rate": 1.901215209721678e-06, "loss": 0.3458, "step": 2739 }, { "epoch": 2.8994708994708995, "grad_norm": 0.1949299864756493, "learning_rate": 1.8816150529204234e-06, "loss": 0.3851, "step": 2740 }, { "epoch": 2.9005291005291003, "grad_norm": 0.18716277246541932, "learning_rate": 1.862014896119169e-06, "loss": 0.3562, "step": 2741 }, { "epoch": 2.9015873015873015, "grad_norm": 0.18868051576949735, "learning_rate": 1.8424147393179145e-06, "loss": 0.3559, "step": 2742 }, { "epoch": 2.9026455026455027, "grad_norm": 0.1992670631408877, "learning_rate": 1.8228145825166603e-06, "loss": 0.3953, "step": 2743 }, { "epoch": 2.9037037037037035, "grad_norm": 0.190011431296961, "learning_rate": 1.803214425715406e-06, "loss": 0.3517, "step": 2744 }, { "epoch": 2.9047619047619047, "grad_norm": 0.1898685284634927, "learning_rate": 1.7836142689141515e-06, "loss": 0.324, "step": 2745 }, { "epoch": 2.905820105820106, "grad_norm": 0.20798301264228833, "learning_rate": 1.7640141121128971e-06, "loss": 0.4022, "step": 2746 }, { "epoch": 2.9068783068783066, "grad_norm": 0.1884618131179482, "learning_rate": 1.7444139553116426e-06, "loss": 0.3162, "step": 2747 }, { "epoch": 2.907936507936508, "grad_norm": 0.2062849190831743, "learning_rate": 1.7248137985103882e-06, "loss": 0.3713, "step": 2748 }, { "epoch": 2.908994708994709, "grad_norm": 0.21572426764959662, "learning_rate": 1.7052136417091337e-06, "loss": 0.3902, "step": 2749 }, { "epoch": 2.91005291005291, "grad_norm": 0.29553314531980623, "learning_rate": 1.6856134849078794e-06, "loss": 0.3407, "step": 2750 }, { "epoch": 2.911111111111111, "grad_norm": 0.20393510044120217, "learning_rate": 1.6660133281066248e-06, "loss": 0.4114, "step": 2751 }, { "epoch": 2.912169312169312, "grad_norm": 0.16933792741507034, "learning_rate": 1.6464131713053705e-06, "loss": 0.3066, "step": 2752 }, { "epoch": 2.9132275132275134, "grad_norm": 0.1996783103654981, "learning_rate": 1.6268130145041163e-06, "loss": 0.3649, "step": 2753 }, { "epoch": 2.914285714285714, "grad_norm": 0.1967458515577118, "learning_rate": 1.6072128577028618e-06, "loss": 0.3568, "step": 2754 }, { "epoch": 2.9153439153439153, "grad_norm": 0.18448247900585343, "learning_rate": 1.5876127009016075e-06, "loss": 0.326, "step": 2755 }, { "epoch": 2.9164021164021166, "grad_norm": 0.19461118793102933, "learning_rate": 1.568012544100353e-06, "loss": 0.3213, "step": 2756 }, { "epoch": 2.9174603174603173, "grad_norm": 0.1899035298993076, "learning_rate": 1.5484123872990986e-06, "loss": 0.3387, "step": 2757 }, { "epoch": 2.9185185185185185, "grad_norm": 0.1973718574120133, "learning_rate": 1.528812230497844e-06, "loss": 0.3617, "step": 2758 }, { "epoch": 2.9195767195767197, "grad_norm": 0.29546988416171227, "learning_rate": 1.5092120736965897e-06, "loss": 0.338, "step": 2759 }, { "epoch": 2.9206349206349205, "grad_norm": 0.18641514359807654, "learning_rate": 1.4896119168953353e-06, "loss": 0.3353, "step": 2760 }, { "epoch": 2.9216931216931217, "grad_norm": 0.1986244733946501, "learning_rate": 1.4700117600940808e-06, "loss": 0.3652, "step": 2761 }, { "epoch": 2.922751322751323, "grad_norm": 0.1840275855494628, "learning_rate": 1.4504116032928265e-06, "loss": 0.34, "step": 2762 }, { "epoch": 2.923809523809524, "grad_norm": 0.1955588620037554, "learning_rate": 1.430811446491572e-06, "loss": 0.3592, "step": 2763 }, { "epoch": 2.924867724867725, "grad_norm": 0.19691689936476783, "learning_rate": 1.4112112896903176e-06, "loss": 0.377, "step": 2764 }, { "epoch": 2.925925925925926, "grad_norm": 0.18009646768548268, "learning_rate": 1.3916111328890632e-06, "loss": 0.3205, "step": 2765 }, { "epoch": 2.9269841269841272, "grad_norm": 0.1898019017223925, "learning_rate": 1.372010976087809e-06, "loss": 0.3549, "step": 2766 }, { "epoch": 2.928042328042328, "grad_norm": 0.19704920070535306, "learning_rate": 1.3524108192865543e-06, "loss": 0.3775, "step": 2767 }, { "epoch": 2.929100529100529, "grad_norm": 0.18705440894929945, "learning_rate": 1.3328106624853e-06, "loss": 0.3265, "step": 2768 }, { "epoch": 2.9301587301587304, "grad_norm": 0.16473519945175205, "learning_rate": 1.3132105056840455e-06, "loss": 0.2888, "step": 2769 }, { "epoch": 2.931216931216931, "grad_norm": 0.19655934519795515, "learning_rate": 1.2936103488827911e-06, "loss": 0.3502, "step": 2770 }, { "epoch": 2.9322751322751324, "grad_norm": 0.19677250020354922, "learning_rate": 1.2740101920815368e-06, "loss": 0.377, "step": 2771 }, { "epoch": 2.9333333333333336, "grad_norm": 0.1937023595690031, "learning_rate": 1.2544100352802822e-06, "loss": 0.3376, "step": 2772 }, { "epoch": 2.9343915343915343, "grad_norm": 0.1884504777247503, "learning_rate": 1.234809878479028e-06, "loss": 0.3534, "step": 2773 }, { "epoch": 2.9354497354497355, "grad_norm": 0.1807776339927901, "learning_rate": 1.2152097216777734e-06, "loss": 0.3001, "step": 2774 }, { "epoch": 2.9365079365079367, "grad_norm": 0.19494893044097508, "learning_rate": 1.1956095648765192e-06, "loss": 0.3601, "step": 2775 }, { "epoch": 2.9375661375661375, "grad_norm": 0.18173859837169326, "learning_rate": 1.1760094080752647e-06, "loss": 0.3351, "step": 2776 }, { "epoch": 2.9386243386243387, "grad_norm": 0.19155272417454372, "learning_rate": 1.1564092512740103e-06, "loss": 0.3246, "step": 2777 }, { "epoch": 2.93968253968254, "grad_norm": 0.18097278381295773, "learning_rate": 1.1368090944727558e-06, "loss": 0.3325, "step": 2778 }, { "epoch": 2.9407407407407407, "grad_norm": 0.18795150930344018, "learning_rate": 1.1172089376715015e-06, "loss": 0.3289, "step": 2779 }, { "epoch": 2.941798941798942, "grad_norm": 0.19915958751004778, "learning_rate": 1.0976087808702471e-06, "loss": 0.3626, "step": 2780 }, { "epoch": 2.942857142857143, "grad_norm": 0.21232105482687197, "learning_rate": 1.0780086240689926e-06, "loss": 0.4117, "step": 2781 }, { "epoch": 2.943915343915344, "grad_norm": 0.1864031377940111, "learning_rate": 1.0584084672677382e-06, "loss": 0.3617, "step": 2782 }, { "epoch": 2.944973544973545, "grad_norm": 0.20266505522326464, "learning_rate": 1.0388083104664837e-06, "loss": 0.4006, "step": 2783 }, { "epoch": 2.9460317460317462, "grad_norm": 0.17805463329627502, "learning_rate": 1.0192081536652293e-06, "loss": 0.3185, "step": 2784 }, { "epoch": 2.947089947089947, "grad_norm": 0.1865921965755665, "learning_rate": 9.99607996863975e-07, "loss": 0.3616, "step": 2785 }, { "epoch": 2.948148148148148, "grad_norm": 0.19565848130960645, "learning_rate": 9.800078400627207e-07, "loss": 0.3565, "step": 2786 }, { "epoch": 2.9492063492063494, "grad_norm": 0.19030143596622032, "learning_rate": 9.604076832614661e-07, "loss": 0.3379, "step": 2787 }, { "epoch": 2.95026455026455, "grad_norm": 0.2078056379962277, "learning_rate": 9.408075264602117e-07, "loss": 0.3939, "step": 2788 }, { "epoch": 2.9513227513227513, "grad_norm": 0.18813468584278845, "learning_rate": 9.212073696589572e-07, "loss": 0.3346, "step": 2789 }, { "epoch": 2.9523809523809526, "grad_norm": 0.17678708921482536, "learning_rate": 9.01607212857703e-07, "loss": 0.3052, "step": 2790 }, { "epoch": 2.9534391534391533, "grad_norm": 0.18575208703680052, "learning_rate": 8.820070560564486e-07, "loss": 0.323, "step": 2791 }, { "epoch": 2.9544973544973545, "grad_norm": 0.18700053280637205, "learning_rate": 8.624068992551941e-07, "loss": 0.3312, "step": 2792 }, { "epoch": 2.9555555555555557, "grad_norm": 0.1947341580545555, "learning_rate": 8.428067424539397e-07, "loss": 0.3652, "step": 2793 }, { "epoch": 2.9566137566137565, "grad_norm": 0.18336157333209027, "learning_rate": 8.232065856526852e-07, "loss": 0.299, "step": 2794 }, { "epoch": 2.9576719576719577, "grad_norm": 0.1989438129617965, "learning_rate": 8.036064288514309e-07, "loss": 0.3758, "step": 2795 }, { "epoch": 2.958730158730159, "grad_norm": 0.1795010261425146, "learning_rate": 7.840062720501765e-07, "loss": 0.3095, "step": 2796 }, { "epoch": 2.9597883597883596, "grad_norm": 0.1878025427152609, "learning_rate": 7.64406115248922e-07, "loss": 0.343, "step": 2797 }, { "epoch": 2.960846560846561, "grad_norm": 0.1860910101745494, "learning_rate": 7.448059584476677e-07, "loss": 0.3434, "step": 2798 }, { "epoch": 2.961904761904762, "grad_norm": 0.2080637929191924, "learning_rate": 7.252058016464132e-07, "loss": 0.3734, "step": 2799 }, { "epoch": 2.962962962962963, "grad_norm": 0.19340393517672708, "learning_rate": 7.056056448451588e-07, "loss": 0.3631, "step": 2800 }, { "epoch": 2.964021164021164, "grad_norm": 0.20271654196591155, "learning_rate": 6.860054880439044e-07, "loss": 0.3958, "step": 2801 }, { "epoch": 2.965079365079365, "grad_norm": 0.19696852611219293, "learning_rate": 6.6640533124265e-07, "loss": 0.3719, "step": 2802 }, { "epoch": 2.966137566137566, "grad_norm": 0.1884693790216164, "learning_rate": 6.468051744413956e-07, "loss": 0.3243, "step": 2803 }, { "epoch": 2.967195767195767, "grad_norm": 0.1875853128029582, "learning_rate": 6.272050176401411e-07, "loss": 0.3405, "step": 2804 }, { "epoch": 2.9682539682539684, "grad_norm": 0.19330130887895539, "learning_rate": 6.076048608388867e-07, "loss": 0.3742, "step": 2805 }, { "epoch": 2.969312169312169, "grad_norm": 0.2007548359643975, "learning_rate": 5.880047040376323e-07, "loss": 0.3831, "step": 2806 }, { "epoch": 2.9703703703703703, "grad_norm": 0.18320658631267467, "learning_rate": 5.684045472363779e-07, "loss": 0.3455, "step": 2807 }, { "epoch": 2.9714285714285715, "grad_norm": 0.19352908453997061, "learning_rate": 5.488043904351236e-07, "loss": 0.3337, "step": 2808 }, { "epoch": 2.9724867724867723, "grad_norm": 0.18917045289367823, "learning_rate": 5.292042336338691e-07, "loss": 0.3668, "step": 2809 }, { "epoch": 2.9735449735449735, "grad_norm": 0.28331421422089914, "learning_rate": 5.096040768326147e-07, "loss": 0.3877, "step": 2810 }, { "epoch": 2.9746031746031747, "grad_norm": 0.18730859428046542, "learning_rate": 4.900039200313603e-07, "loss": 0.3235, "step": 2811 }, { "epoch": 2.9756613756613755, "grad_norm": 0.2061919680406255, "learning_rate": 4.7040376323010584e-07, "loss": 0.3877, "step": 2812 }, { "epoch": 2.9767195767195767, "grad_norm": 0.17991475264599735, "learning_rate": 4.508036064288515e-07, "loss": 0.3117, "step": 2813 }, { "epoch": 2.977777777777778, "grad_norm": 0.18979406930672754, "learning_rate": 4.3120344962759706e-07, "loss": 0.3523, "step": 2814 }, { "epoch": 2.9788359788359786, "grad_norm": 0.20105316878907992, "learning_rate": 4.116032928263426e-07, "loss": 0.3547, "step": 2815 }, { "epoch": 2.97989417989418, "grad_norm": 0.18428131124919112, "learning_rate": 3.920031360250882e-07, "loss": 0.323, "step": 2816 }, { "epoch": 2.980952380952381, "grad_norm": 0.1812934812970745, "learning_rate": 3.7240297922383384e-07, "loss": 0.3181, "step": 2817 }, { "epoch": 2.982010582010582, "grad_norm": 0.1863114001092859, "learning_rate": 3.528028224225794e-07, "loss": 0.3177, "step": 2818 }, { "epoch": 2.983068783068783, "grad_norm": 0.18605121181705242, "learning_rate": 3.33202665621325e-07, "loss": 0.336, "step": 2819 }, { "epoch": 2.984126984126984, "grad_norm": 0.20781903726047113, "learning_rate": 3.1360250882007056e-07, "loss": 0.4122, "step": 2820 }, { "epoch": 2.985185185185185, "grad_norm": 0.2016418889379911, "learning_rate": 2.9400235201881617e-07, "loss": 0.3835, "step": 2821 }, { "epoch": 2.986243386243386, "grad_norm": 0.17435892542662546, "learning_rate": 2.744021952175618e-07, "loss": 0.302, "step": 2822 }, { "epoch": 2.9873015873015873, "grad_norm": 0.18917626577977661, "learning_rate": 2.5480203841630734e-07, "loss": 0.3654, "step": 2823 }, { "epoch": 2.988359788359788, "grad_norm": 0.18622957441576268, "learning_rate": 2.3520188161505292e-07, "loss": 0.3388, "step": 2824 }, { "epoch": 2.9894179894179893, "grad_norm": 0.1859174632852059, "learning_rate": 2.1560172481379853e-07, "loss": 0.3393, "step": 2825 }, { "epoch": 2.9904761904761905, "grad_norm": 0.19065097631350006, "learning_rate": 1.960015680125441e-07, "loss": 0.3422, "step": 2826 }, { "epoch": 2.9915343915343913, "grad_norm": 0.18272454868166982, "learning_rate": 1.764014112112897e-07, "loss": 0.3127, "step": 2827 }, { "epoch": 2.9925925925925925, "grad_norm": 0.18725014042815322, "learning_rate": 1.5680125441003528e-07, "loss": 0.3275, "step": 2828 }, { "epoch": 2.9936507936507937, "grad_norm": 0.17561038959172634, "learning_rate": 1.372010976087809e-07, "loss": 0.3066, "step": 2829 }, { "epoch": 2.9947089947089944, "grad_norm": 0.18415272195204574, "learning_rate": 1.1760094080752646e-07, "loss": 0.3328, "step": 2830 }, { "epoch": 2.9957671957671956, "grad_norm": 0.17845429273021535, "learning_rate": 9.800078400627206e-08, "loss": 0.2933, "step": 2831 }, { "epoch": 2.996825396825397, "grad_norm": 0.1784414113949217, "learning_rate": 7.840062720501764e-08, "loss": 0.3129, "step": 2832 }, { "epoch": 2.997883597883598, "grad_norm": 0.18326094077219415, "learning_rate": 5.880047040376323e-08, "loss": 0.339, "step": 2833 }, { "epoch": 2.998941798941799, "grad_norm": 0.181202143145444, "learning_rate": 3.920031360250882e-08, "loss": 0.2975, "step": 2834 }, { "epoch": 3.0, "grad_norm": 1.1791412063210063, "learning_rate": 1.960015680125441e-08, "loss": 0.3422, "step": 2835 }, { "epoch": 3.0, "step": 2835, "total_flos": 3.152810456905679e+19, "train_loss": 0.5287227819941451, "train_runtime": 91700.2784, "train_samples_per_second": 0.495, "train_steps_per_second": 0.031 } ], "logging_steps": 1, "max_steps": 2835, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.152810456905679e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }