diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,19888 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 2835, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0010582010582010583, + "grad_norm": 49.71796324571249, + "learning_rate": 0.0, + "loss": 11.3407, + "step": 1 + }, + { + "epoch": 0.0021164021164021165, + "grad_norm": 47.138920498887764, + "learning_rate": 1.7605633802816901e-07, + "loss": 11.3367, + "step": 2 + }, + { + "epoch": 0.0031746031746031746, + "grad_norm": 45.86427493870416, + "learning_rate": 3.5211267605633803e-07, + "loss": 11.3782, + "step": 3 + }, + { + "epoch": 0.004232804232804233, + "grad_norm": 46.592216421647926, + "learning_rate": 5.28169014084507e-07, + "loss": 11.4448, + "step": 4 + }, + { + "epoch": 0.005291005291005291, + "grad_norm": 51.27861968325324, + "learning_rate": 7.042253521126761e-07, + "loss": 11.1976, + "step": 5 + }, + { + "epoch": 0.006349206349206349, + "grad_norm": 48.68604904156757, + "learning_rate": 8.802816901408452e-07, + "loss": 11.3589, + "step": 6 + }, + { + "epoch": 0.007407407407407408, + "grad_norm": 49.04868842468927, + "learning_rate": 1.056338028169014e-06, + "loss": 11.216, + "step": 7 + }, + { + "epoch": 0.008465608465608466, + "grad_norm": 45.78424778978211, + "learning_rate": 1.232394366197183e-06, + "loss": 11.2126, + "step": 8 + }, + { + "epoch": 0.009523809523809525, + "grad_norm": 48.288808560697824, + "learning_rate": 1.4084507042253521e-06, + "loss": 11.2473, + "step": 9 + }, + { + "epoch": 0.010582010582010581, + "grad_norm": 47.00529127513613, + "learning_rate": 1.5845070422535212e-06, + "loss": 11.181, + "step": 10 + }, + { + "epoch": 0.01164021164021164, + "grad_norm": 69.96523123803429, + "learning_rate": 1.7605633802816904e-06, + "loss": 10.1444, + "step": 11 + }, + { + "epoch": 0.012698412698412698, + "grad_norm": 67.99361968629948, + "learning_rate": 1.936619718309859e-06, + "loss": 10.2666, + "step": 12 + }, + { + "epoch": 0.013756613756613757, + "grad_norm": 82.7361163900619, + "learning_rate": 2.112676056338028e-06, + "loss": 9.6636, + "step": 13 + }, + { + "epoch": 0.014814814814814815, + "grad_norm": 77.91883219725891, + "learning_rate": 2.2887323943661975e-06, + "loss": 9.7129, + "step": 14 + }, + { + "epoch": 0.015873015873015872, + "grad_norm": 80.69870504421503, + "learning_rate": 2.464788732394366e-06, + "loss": 5.0932, + "step": 15 + }, + { + "epoch": 0.016931216931216932, + "grad_norm": 73.85506610645764, + "learning_rate": 2.640845070422535e-06, + "loss": 4.7543, + "step": 16 + }, + { + "epoch": 0.01798941798941799, + "grad_norm": 72.44122211654165, + "learning_rate": 2.8169014084507042e-06, + "loss": 4.7132, + "step": 17 + }, + { + "epoch": 0.01904761904761905, + "grad_norm": 54.80640084894919, + "learning_rate": 2.9929577464788733e-06, + "loss": 3.8029, + "step": 18 + }, + { + "epoch": 0.020105820105820106, + "grad_norm": 51.95247253108577, + "learning_rate": 3.1690140845070423e-06, + "loss": 3.7085, + "step": 19 + }, + { + "epoch": 0.021164021164021163, + "grad_norm": 14.24491850890378, + "learning_rate": 3.3450704225352113e-06, + "loss": 2.1113, + "step": 20 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 6.987157752616172, + "learning_rate": 3.521126760563381e-06, + "loss": 1.7438, + "step": 21 + }, + { + "epoch": 0.02328042328042328, + "grad_norm": 5.830944797295927, + "learning_rate": 3.6971830985915494e-06, + "loss": 1.7295, + "step": 22 + }, + { + "epoch": 0.02433862433862434, + "grad_norm": 5.138674915616038, + "learning_rate": 3.873239436619718e-06, + "loss": 1.6184, + "step": 23 + }, + { + "epoch": 0.025396825396825397, + "grad_norm": 3.8763256924499605, + "learning_rate": 4.0492957746478875e-06, + "loss": 1.5636, + "step": 24 + }, + { + "epoch": 0.026455026455026454, + "grad_norm": 4.336003458578358, + "learning_rate": 4.225352112676056e-06, + "loss": 1.5242, + "step": 25 + }, + { + "epoch": 0.027513227513227514, + "grad_norm": 3.002308914115821, + "learning_rate": 4.401408450704226e-06, + "loss": 1.5592, + "step": 26 + }, + { + "epoch": 0.02857142857142857, + "grad_norm": 2.211668797457168, + "learning_rate": 4.577464788732395e-06, + "loss": 1.3246, + "step": 27 + }, + { + "epoch": 0.02962962962962963, + "grad_norm": 1.856090631854003, + "learning_rate": 4.753521126760564e-06, + "loss": 1.3504, + "step": 28 + }, + { + "epoch": 0.030687830687830688, + "grad_norm": 1.730441467550159, + "learning_rate": 4.929577464788732e-06, + "loss": 1.1383, + "step": 29 + }, + { + "epoch": 0.031746031746031744, + "grad_norm": 3.037278309026151, + "learning_rate": 5.105633802816902e-06, + "loss": 1.0615, + "step": 30 + }, + { + "epoch": 0.0328042328042328, + "grad_norm": 20.463734102356668, + "learning_rate": 5.28169014084507e-06, + "loss": 1.1804, + "step": 31 + }, + { + "epoch": 0.033862433862433865, + "grad_norm": 1.1822365397348313, + "learning_rate": 5.457746478873239e-06, + "loss": 1.1662, + "step": 32 + }, + { + "epoch": 0.03492063492063492, + "grad_norm": 0.9353301639240665, + "learning_rate": 5.6338028169014084e-06, + "loss": 1.0131, + "step": 33 + }, + { + "epoch": 0.03597883597883598, + "grad_norm": 0.9206265704946479, + "learning_rate": 5.809859154929578e-06, + "loss": 1.0732, + "step": 34 + }, + { + "epoch": 0.037037037037037035, + "grad_norm": 11.424200349684694, + "learning_rate": 5.9859154929577465e-06, + "loss": 1.1348, + "step": 35 + }, + { + "epoch": 0.0380952380952381, + "grad_norm": 0.8604419013240469, + "learning_rate": 6.161971830985916e-06, + "loss": 1.0904, + "step": 36 + }, + { + "epoch": 0.039153439153439155, + "grad_norm": 0.779260466106171, + "learning_rate": 6.338028169014085e-06, + "loss": 0.9949, + "step": 37 + }, + { + "epoch": 0.04021164021164021, + "grad_norm": 1.053891911809051, + "learning_rate": 6.514084507042253e-06, + "loss": 0.9232, + "step": 38 + }, + { + "epoch": 0.04126984126984127, + "grad_norm": 0.8065280665981377, + "learning_rate": 6.690140845070423e-06, + "loss": 1.0275, + "step": 39 + }, + { + "epoch": 0.042328042328042326, + "grad_norm": 0.5932376581872476, + "learning_rate": 6.866197183098592e-06, + "loss": 0.9951, + "step": 40 + }, + { + "epoch": 0.04338624338624339, + "grad_norm": 0.7470084393654567, + "learning_rate": 7.042253521126762e-06, + "loss": 0.9576, + "step": 41 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 0.7190983046762137, + "learning_rate": 7.21830985915493e-06, + "loss": 1.0888, + "step": 42 + }, + { + "epoch": 0.0455026455026455, + "grad_norm": 0.6401594584722508, + "learning_rate": 7.394366197183099e-06, + "loss": 0.9013, + "step": 43 + }, + { + "epoch": 0.04656084656084656, + "grad_norm": 0.5484291413984935, + "learning_rate": 7.5704225352112675e-06, + "loss": 1.0244, + "step": 44 + }, + { + "epoch": 0.047619047619047616, + "grad_norm": 0.5552253491849386, + "learning_rate": 7.746478873239436e-06, + "loss": 0.9532, + "step": 45 + }, + { + "epoch": 0.04867724867724868, + "grad_norm": 0.5652331360674767, + "learning_rate": 7.922535211267606e-06, + "loss": 0.9989, + "step": 46 + }, + { + "epoch": 0.04973544973544974, + "grad_norm": 0.5293118764740489, + "learning_rate": 8.098591549295775e-06, + "loss": 0.9298, + "step": 47 + }, + { + "epoch": 0.050793650793650794, + "grad_norm": 0.5285967447293929, + "learning_rate": 8.274647887323944e-06, + "loss": 0.9737, + "step": 48 + }, + { + "epoch": 0.05185185185185185, + "grad_norm": 0.5178212820525971, + "learning_rate": 8.450704225352112e-06, + "loss": 0.9258, + "step": 49 + }, + { + "epoch": 0.05291005291005291, + "grad_norm": 0.4733170168194918, + "learning_rate": 8.626760563380283e-06, + "loss": 0.9534, + "step": 50 + }, + { + "epoch": 0.05396825396825397, + "grad_norm": 0.5149256098520258, + "learning_rate": 8.802816901408451e-06, + "loss": 1.0816, + "step": 51 + }, + { + "epoch": 0.05502645502645503, + "grad_norm": 0.4685693575510743, + "learning_rate": 8.978873239436621e-06, + "loss": 1.027, + "step": 52 + }, + { + "epoch": 0.056084656084656084, + "grad_norm": 0.39387712556840015, + "learning_rate": 9.15492957746479e-06, + "loss": 0.8691, + "step": 53 + }, + { + "epoch": 0.05714285714285714, + "grad_norm": 0.420846981768801, + "learning_rate": 9.330985915492959e-06, + "loss": 0.8751, + "step": 54 + }, + { + "epoch": 0.0582010582010582, + "grad_norm": 0.3985823470678549, + "learning_rate": 9.507042253521127e-06, + "loss": 0.7835, + "step": 55 + }, + { + "epoch": 0.05925925925925926, + "grad_norm": 0.4015984425355995, + "learning_rate": 9.683098591549296e-06, + "loss": 0.8231, + "step": 56 + }, + { + "epoch": 0.06031746031746032, + "grad_norm": 0.41302581392299265, + "learning_rate": 9.859154929577465e-06, + "loss": 0.9619, + "step": 57 + }, + { + "epoch": 0.061375661375661375, + "grad_norm": 0.3973364065159126, + "learning_rate": 1.0035211267605635e-05, + "loss": 0.8624, + "step": 58 + }, + { + "epoch": 0.06243386243386243, + "grad_norm": 0.3600376955888632, + "learning_rate": 1.0211267605633803e-05, + "loss": 0.9197, + "step": 59 + }, + { + "epoch": 0.06349206349206349, + "grad_norm": 0.38943169766010316, + "learning_rate": 1.0387323943661972e-05, + "loss": 0.7723, + "step": 60 + }, + { + "epoch": 0.06455026455026455, + "grad_norm": 0.40177295774598387, + "learning_rate": 1.056338028169014e-05, + "loss": 0.8586, + "step": 61 + }, + { + "epoch": 0.0656084656084656, + "grad_norm": 0.36974015962677353, + "learning_rate": 1.073943661971831e-05, + "loss": 0.8905, + "step": 62 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.3669768884030218, + "learning_rate": 1.0915492957746478e-05, + "loss": 0.8458, + "step": 63 + }, + { + "epoch": 0.06772486772486773, + "grad_norm": 0.4194352772922352, + "learning_rate": 1.1091549295774648e-05, + "loss": 0.92, + "step": 64 + }, + { + "epoch": 0.06878306878306878, + "grad_norm": 0.33216123239758316, + "learning_rate": 1.1267605633802817e-05, + "loss": 0.7894, + "step": 65 + }, + { + "epoch": 0.06984126984126984, + "grad_norm": 0.32639923159440226, + "learning_rate": 1.1443661971830987e-05, + "loss": 0.8095, + "step": 66 + }, + { + "epoch": 0.07089947089947089, + "grad_norm": 0.3829927205982677, + "learning_rate": 1.1619718309859156e-05, + "loss": 0.8557, + "step": 67 + }, + { + "epoch": 0.07195767195767196, + "grad_norm": 0.3222649310951342, + "learning_rate": 1.1795774647887324e-05, + "loss": 0.8743, + "step": 68 + }, + { + "epoch": 0.07301587301587302, + "grad_norm": 0.30020466526689116, + "learning_rate": 1.1971830985915493e-05, + "loss": 0.7349, + "step": 69 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 0.3519372592943403, + "learning_rate": 1.2147887323943663e-05, + "loss": 0.7388, + "step": 70 + }, + { + "epoch": 0.07513227513227513, + "grad_norm": 0.318183168764709, + "learning_rate": 1.2323943661971832e-05, + "loss": 0.7519, + "step": 71 + }, + { + "epoch": 0.0761904761904762, + "grad_norm": 0.310753241584739, + "learning_rate": 1.25e-05, + "loss": 0.7459, + "step": 72 + }, + { + "epoch": 0.07724867724867725, + "grad_norm": 0.3517552226109988, + "learning_rate": 1.267605633802817e-05, + "loss": 0.9719, + "step": 73 + }, + { + "epoch": 0.07830687830687831, + "grad_norm": 0.3213372230642392, + "learning_rate": 1.2852112676056338e-05, + "loss": 0.7786, + "step": 74 + }, + { + "epoch": 0.07936507936507936, + "grad_norm": 1.014002688907997, + "learning_rate": 1.3028169014084506e-05, + "loss": 0.8212, + "step": 75 + }, + { + "epoch": 0.08042328042328042, + "grad_norm": 0.2970169734383055, + "learning_rate": 1.3204225352112675e-05, + "loss": 0.8132, + "step": 76 + }, + { + "epoch": 0.08148148148148149, + "grad_norm": 0.305476164829128, + "learning_rate": 1.3380281690140845e-05, + "loss": 0.792, + "step": 77 + }, + { + "epoch": 0.08253968253968254, + "grad_norm": 0.30966098583703827, + "learning_rate": 1.3556338028169016e-05, + "loss": 0.7983, + "step": 78 + }, + { + "epoch": 0.0835978835978836, + "grad_norm": 0.35996391292327967, + "learning_rate": 1.3732394366197184e-05, + "loss": 0.8742, + "step": 79 + }, + { + "epoch": 0.08465608465608465, + "grad_norm": 0.30968125317223627, + "learning_rate": 1.3908450704225353e-05, + "loss": 0.748, + "step": 80 + }, + { + "epoch": 0.08571428571428572, + "grad_norm": 0.35611081185640775, + "learning_rate": 1.4084507042253523e-05, + "loss": 0.853, + "step": 81 + }, + { + "epoch": 0.08677248677248678, + "grad_norm": 0.2966375377032811, + "learning_rate": 1.4260563380281692e-05, + "loss": 0.7151, + "step": 82 + }, + { + "epoch": 0.08783068783068783, + "grad_norm": 0.7068384588765272, + "learning_rate": 1.443661971830986e-05, + "loss": 0.7145, + "step": 83 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 0.356149749502437, + "learning_rate": 1.4612676056338029e-05, + "loss": 0.7752, + "step": 84 + }, + { + "epoch": 0.08994708994708994, + "grad_norm": 0.30485993113060483, + "learning_rate": 1.4788732394366198e-05, + "loss": 0.6923, + "step": 85 + }, + { + "epoch": 0.091005291005291, + "grad_norm": 0.3283908018328014, + "learning_rate": 1.4964788732394366e-05, + "loss": 0.8052, + "step": 86 + }, + { + "epoch": 0.09206349206349207, + "grad_norm": 0.3272519816451936, + "learning_rate": 1.5140845070422535e-05, + "loss": 0.6502, + "step": 87 + }, + { + "epoch": 0.09312169312169312, + "grad_norm": 0.3540380713478706, + "learning_rate": 1.5316901408450704e-05, + "loss": 0.8252, + "step": 88 + }, + { + "epoch": 0.09417989417989418, + "grad_norm": 0.32708774961615394, + "learning_rate": 1.5492957746478872e-05, + "loss": 0.8256, + "step": 89 + }, + { + "epoch": 0.09523809523809523, + "grad_norm": 0.3078761207400959, + "learning_rate": 1.5669014084507044e-05, + "loss": 0.7787, + "step": 90 + }, + { + "epoch": 0.0962962962962963, + "grad_norm": 0.3085651879842953, + "learning_rate": 1.5845070422535213e-05, + "loss": 0.7873, + "step": 91 + }, + { + "epoch": 0.09735449735449736, + "grad_norm": 0.3411588858236989, + "learning_rate": 1.602112676056338e-05, + "loss": 0.7641, + "step": 92 + }, + { + "epoch": 0.09841269841269841, + "grad_norm": 0.30194570638829077, + "learning_rate": 1.619718309859155e-05, + "loss": 0.6853, + "step": 93 + }, + { + "epoch": 0.09947089947089947, + "grad_norm": 0.3417269654477643, + "learning_rate": 1.637323943661972e-05, + "loss": 0.7916, + "step": 94 + }, + { + "epoch": 0.10052910052910052, + "grad_norm": 0.31589650880434206, + "learning_rate": 1.6549295774647887e-05, + "loss": 0.7737, + "step": 95 + }, + { + "epoch": 0.10158730158730159, + "grad_norm": 0.31367036126327846, + "learning_rate": 1.6725352112676056e-05, + "loss": 0.7542, + "step": 96 + }, + { + "epoch": 0.10264550264550265, + "grad_norm": 0.31057819203477377, + "learning_rate": 1.6901408450704224e-05, + "loss": 0.6593, + "step": 97 + }, + { + "epoch": 0.1037037037037037, + "grad_norm": 0.34793873952604554, + "learning_rate": 1.7077464788732393e-05, + "loss": 0.8028, + "step": 98 + }, + { + "epoch": 0.10476190476190476, + "grad_norm": 0.3480073285637551, + "learning_rate": 1.7253521126760565e-05, + "loss": 0.7468, + "step": 99 + }, + { + "epoch": 0.10582010582010581, + "grad_norm": 0.28880176690327597, + "learning_rate": 1.7429577464788734e-05, + "loss": 0.692, + "step": 100 + }, + { + "epoch": 0.10687830687830688, + "grad_norm": 0.3138056785283419, + "learning_rate": 1.7605633802816902e-05, + "loss": 0.6864, + "step": 101 + }, + { + "epoch": 0.10793650793650794, + "grad_norm": 0.32366719651656767, + "learning_rate": 1.778169014084507e-05, + "loss": 0.7089, + "step": 102 + }, + { + "epoch": 0.10899470899470899, + "grad_norm": 0.33175345049430516, + "learning_rate": 1.7957746478873243e-05, + "loss": 0.8091, + "step": 103 + }, + { + "epoch": 0.11005291005291006, + "grad_norm": 0.3133523678106677, + "learning_rate": 1.813380281690141e-05, + "loss": 0.7224, + "step": 104 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.3061911187313938, + "learning_rate": 1.830985915492958e-05, + "loss": 0.7376, + "step": 105 + }, + { + "epoch": 0.11216931216931217, + "grad_norm": 0.3006228205551271, + "learning_rate": 1.848591549295775e-05, + "loss": 0.7126, + "step": 106 + }, + { + "epoch": 0.11322751322751323, + "grad_norm": 0.30813453357630666, + "learning_rate": 1.8661971830985917e-05, + "loss": 0.6727, + "step": 107 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.2877128391886516, + "learning_rate": 1.8838028169014086e-05, + "loss": 0.661, + "step": 108 + }, + { + "epoch": 0.11534391534391535, + "grad_norm": 0.3638361745005654, + "learning_rate": 1.9014084507042255e-05, + "loss": 0.7072, + "step": 109 + }, + { + "epoch": 0.1164021164021164, + "grad_norm": 0.30920630591593606, + "learning_rate": 1.9190140845070423e-05, + "loss": 0.6213, + "step": 110 + }, + { + "epoch": 0.11746031746031746, + "grad_norm": 0.5492672965501992, + "learning_rate": 1.9366197183098592e-05, + "loss": 0.7353, + "step": 111 + }, + { + "epoch": 0.11851851851851852, + "grad_norm": 0.3885248761167444, + "learning_rate": 1.954225352112676e-05, + "loss": 0.8189, + "step": 112 + }, + { + "epoch": 0.11957671957671957, + "grad_norm": 0.31766936558876047, + "learning_rate": 1.971830985915493e-05, + "loss": 0.7251, + "step": 113 + }, + { + "epoch": 0.12063492063492064, + "grad_norm": 0.3878825328465268, + "learning_rate": 1.98943661971831e-05, + "loss": 0.7747, + "step": 114 + }, + { + "epoch": 0.12169312169312169, + "grad_norm": 0.3260125841556687, + "learning_rate": 2.007042253521127e-05, + "loss": 0.6867, + "step": 115 + }, + { + "epoch": 0.12275132275132275, + "grad_norm": 0.3582626153310118, + "learning_rate": 2.024647887323944e-05, + "loss": 0.6981, + "step": 116 + }, + { + "epoch": 0.12380952380952381, + "grad_norm": 0.3585469784947143, + "learning_rate": 2.0422535211267607e-05, + "loss": 0.7825, + "step": 117 + }, + { + "epoch": 0.12486772486772486, + "grad_norm": 0.3442159652247866, + "learning_rate": 2.0598591549295776e-05, + "loss": 0.7701, + "step": 118 + }, + { + "epoch": 0.1259259259259259, + "grad_norm": 0.3403066620592321, + "learning_rate": 2.0774647887323944e-05, + "loss": 0.6793, + "step": 119 + }, + { + "epoch": 0.12698412698412698, + "grad_norm": 0.33853441823411223, + "learning_rate": 2.0950704225352113e-05, + "loss": 0.6876, + "step": 120 + }, + { + "epoch": 0.12804232804232804, + "grad_norm": 0.3153321491578789, + "learning_rate": 2.112676056338028e-05, + "loss": 0.5972, + "step": 121 + }, + { + "epoch": 0.1291005291005291, + "grad_norm": 0.41035577764059683, + "learning_rate": 2.130281690140845e-05, + "loss": 0.7687, + "step": 122 + }, + { + "epoch": 0.13015873015873017, + "grad_norm": 0.34224845150721145, + "learning_rate": 2.147887323943662e-05, + "loss": 0.6678, + "step": 123 + }, + { + "epoch": 0.1312169312169312, + "grad_norm": 0.37838653308297, + "learning_rate": 2.1654929577464787e-05, + "loss": 0.664, + "step": 124 + }, + { + "epoch": 0.13227513227513227, + "grad_norm": 0.356256058295494, + "learning_rate": 2.1830985915492956e-05, + "loss": 0.7416, + "step": 125 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.3670688477705132, + "learning_rate": 2.2007042253521128e-05, + "loss": 0.6603, + "step": 126 + }, + { + "epoch": 0.1343915343915344, + "grad_norm": 0.3696864614307807, + "learning_rate": 2.2183098591549297e-05, + "loss": 0.6723, + "step": 127 + }, + { + "epoch": 0.13544973544973546, + "grad_norm": 0.305168106942622, + "learning_rate": 2.2359154929577465e-05, + "loss": 0.6295, + "step": 128 + }, + { + "epoch": 0.1365079365079365, + "grad_norm": 1.5741882840916825, + "learning_rate": 2.2535211267605634e-05, + "loss": 0.738, + "step": 129 + }, + { + "epoch": 0.13756613756613756, + "grad_norm": 0.40676802977095083, + "learning_rate": 2.2711267605633806e-05, + "loss": 0.6888, + "step": 130 + }, + { + "epoch": 0.13862433862433862, + "grad_norm": 0.40640312190176336, + "learning_rate": 2.2887323943661974e-05, + "loss": 0.8524, + "step": 131 + }, + { + "epoch": 0.13968253968253969, + "grad_norm": 0.3996709103146616, + "learning_rate": 2.3063380281690143e-05, + "loss": 0.6707, + "step": 132 + }, + { + "epoch": 0.14074074074074075, + "grad_norm": 0.3626239173606667, + "learning_rate": 2.323943661971831e-05, + "loss": 0.7601, + "step": 133 + }, + { + "epoch": 0.14179894179894179, + "grad_norm": 0.3889512118071032, + "learning_rate": 2.341549295774648e-05, + "loss": 0.6534, + "step": 134 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.439542393481695, + "learning_rate": 2.359154929577465e-05, + "loss": 0.7289, + "step": 135 + }, + { + "epoch": 0.1439153439153439, + "grad_norm": 0.36264558696354626, + "learning_rate": 2.3767605633802817e-05, + "loss": 0.6388, + "step": 136 + }, + { + "epoch": 0.14497354497354498, + "grad_norm": 0.3630801750617645, + "learning_rate": 2.3943661971830986e-05, + "loss": 0.7319, + "step": 137 + }, + { + "epoch": 0.14603174603174604, + "grad_norm": 0.32798334573994153, + "learning_rate": 2.4119718309859158e-05, + "loss": 0.7374, + "step": 138 + }, + { + "epoch": 0.14708994708994708, + "grad_norm": 0.3595935453918023, + "learning_rate": 2.4295774647887327e-05, + "loss": 0.6395, + "step": 139 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 0.40173981052190955, + "learning_rate": 2.4471830985915495e-05, + "loss": 0.6469, + "step": 140 + }, + { + "epoch": 0.1492063492063492, + "grad_norm": 0.3567439766311727, + "learning_rate": 2.4647887323943664e-05, + "loss": 0.641, + "step": 141 + }, + { + "epoch": 0.15026455026455027, + "grad_norm": 0.44153899522902645, + "learning_rate": 2.4823943661971833e-05, + "loss": 0.6945, + "step": 142 + }, + { + "epoch": 0.15132275132275133, + "grad_norm": 0.41760121811058754, + "learning_rate": 2.5e-05, + "loss": 0.6683, + "step": 143 + }, + { + "epoch": 0.1523809523809524, + "grad_norm": 0.3532367100024593, + "learning_rate": 2.517605633802817e-05, + "loss": 0.6531, + "step": 144 + }, + { + "epoch": 0.15343915343915343, + "grad_norm": 0.36386964811751354, + "learning_rate": 2.535211267605634e-05, + "loss": 0.7372, + "step": 145 + }, + { + "epoch": 0.1544973544973545, + "grad_norm": 0.3667487681947238, + "learning_rate": 2.5528169014084507e-05, + "loss": 0.6541, + "step": 146 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 0.32814632317286685, + "learning_rate": 2.5704225352112676e-05, + "loss": 0.7053, + "step": 147 + }, + { + "epoch": 0.15661375661375662, + "grad_norm": 0.36213420149359243, + "learning_rate": 2.5880281690140844e-05, + "loss": 0.7516, + "step": 148 + }, + { + "epoch": 0.15767195767195769, + "grad_norm": 0.3388384477751442, + "learning_rate": 2.6056338028169013e-05, + "loss": 0.6326, + "step": 149 + }, + { + "epoch": 0.15873015873015872, + "grad_norm": 0.3739254654391917, + "learning_rate": 2.623239436619718e-05, + "loss": 0.6875, + "step": 150 + }, + { + "epoch": 0.15978835978835979, + "grad_norm": 0.37488629646986044, + "learning_rate": 2.640845070422535e-05, + "loss": 0.6842, + "step": 151 + }, + { + "epoch": 0.16084656084656085, + "grad_norm": 0.3315805811522645, + "learning_rate": 2.658450704225352e-05, + "loss": 0.7401, + "step": 152 + }, + { + "epoch": 0.1619047619047619, + "grad_norm": 0.32245214995851174, + "learning_rate": 2.676056338028169e-05, + "loss": 0.6606, + "step": 153 + }, + { + "epoch": 0.16296296296296298, + "grad_norm": 0.29179792377833336, + "learning_rate": 2.693661971830986e-05, + "loss": 0.6626, + "step": 154 + }, + { + "epoch": 0.164021164021164, + "grad_norm": 0.3645068699975821, + "learning_rate": 2.711267605633803e-05, + "loss": 0.7877, + "step": 155 + }, + { + "epoch": 0.16507936507936508, + "grad_norm": 0.3120322106769627, + "learning_rate": 2.72887323943662e-05, + "loss": 0.6277, + "step": 156 + }, + { + "epoch": 0.16613756613756614, + "grad_norm": 0.31281154861735205, + "learning_rate": 2.746478873239437e-05, + "loss": 0.6771, + "step": 157 + }, + { + "epoch": 0.1671957671957672, + "grad_norm": 0.379934869718354, + "learning_rate": 2.7640845070422537e-05, + "loss": 0.6179, + "step": 158 + }, + { + "epoch": 0.16825396825396827, + "grad_norm": 0.30999752344647463, + "learning_rate": 2.7816901408450706e-05, + "loss": 0.6421, + "step": 159 + }, + { + "epoch": 0.1693121693121693, + "grad_norm": 0.3743307355482593, + "learning_rate": 2.7992957746478874e-05, + "loss": 0.795, + "step": 160 + }, + { + "epoch": 0.17037037037037037, + "grad_norm": 0.38423928807562113, + "learning_rate": 2.8169014084507046e-05, + "loss": 0.7782, + "step": 161 + }, + { + "epoch": 0.17142857142857143, + "grad_norm": 0.4125835959551516, + "learning_rate": 2.8345070422535215e-05, + "loss": 0.6243, + "step": 162 + }, + { + "epoch": 0.1724867724867725, + "grad_norm": 0.33067536279547244, + "learning_rate": 2.8521126760563384e-05, + "loss": 0.6739, + "step": 163 + }, + { + "epoch": 0.17354497354497356, + "grad_norm": 0.39452863321092113, + "learning_rate": 2.8697183098591552e-05, + "loss": 0.6706, + "step": 164 + }, + { + "epoch": 0.1746031746031746, + "grad_norm": 0.33341809821413604, + "learning_rate": 2.887323943661972e-05, + "loss": 0.6624, + "step": 165 + }, + { + "epoch": 0.17566137566137566, + "grad_norm": 0.3999598309398766, + "learning_rate": 2.904929577464789e-05, + "loss": 0.7741, + "step": 166 + }, + { + "epoch": 0.17671957671957672, + "grad_norm": 0.348362123360907, + "learning_rate": 2.9225352112676058e-05, + "loss": 0.6416, + "step": 167 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.38051950011821134, + "learning_rate": 2.9401408450704227e-05, + "loss": 0.7498, + "step": 168 + }, + { + "epoch": 0.17883597883597885, + "grad_norm": 0.3398083012362421, + "learning_rate": 2.9577464788732395e-05, + "loss": 0.6188, + "step": 169 + }, + { + "epoch": 0.17989417989417988, + "grad_norm": 0.3263197338387835, + "learning_rate": 2.9753521126760564e-05, + "loss": 0.6523, + "step": 170 + }, + { + "epoch": 0.18095238095238095, + "grad_norm": 0.29768829451839607, + "learning_rate": 2.9929577464788733e-05, + "loss": 0.5983, + "step": 171 + }, + { + "epoch": 0.182010582010582, + "grad_norm": 0.3616844593383965, + "learning_rate": 3.01056338028169e-05, + "loss": 0.6778, + "step": 172 + }, + { + "epoch": 0.18306878306878308, + "grad_norm": 0.32445494168670697, + "learning_rate": 3.028169014084507e-05, + "loss": 0.5868, + "step": 173 + }, + { + "epoch": 0.18412698412698414, + "grad_norm": 0.3744637982463626, + "learning_rate": 3.045774647887324e-05, + "loss": 0.5966, + "step": 174 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 0.34514886886805735, + "learning_rate": 3.063380281690141e-05, + "loss": 0.735, + "step": 175 + }, + { + "epoch": 0.18624338624338624, + "grad_norm": 0.3614006619260275, + "learning_rate": 3.0809859154929576e-05, + "loss": 0.6083, + "step": 176 + }, + { + "epoch": 0.1873015873015873, + "grad_norm": 0.34851465059010306, + "learning_rate": 3.0985915492957744e-05, + "loss": 0.6449, + "step": 177 + }, + { + "epoch": 0.18835978835978837, + "grad_norm": 0.3083459665743903, + "learning_rate": 3.116197183098591e-05, + "loss": 0.6277, + "step": 178 + }, + { + "epoch": 0.18941798941798943, + "grad_norm": 0.34720965786750074, + "learning_rate": 3.133802816901409e-05, + "loss": 0.6329, + "step": 179 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 0.358285604869434, + "learning_rate": 3.151408450704226e-05, + "loss": 0.6505, + "step": 180 + }, + { + "epoch": 0.19153439153439153, + "grad_norm": 0.34474320283746784, + "learning_rate": 3.1690140845070426e-05, + "loss": 0.6319, + "step": 181 + }, + { + "epoch": 0.1925925925925926, + "grad_norm": 0.3895257488548781, + "learning_rate": 3.1866197183098594e-05, + "loss": 0.8262, + "step": 182 + }, + { + "epoch": 0.19365079365079366, + "grad_norm": 0.3457383269254632, + "learning_rate": 3.204225352112676e-05, + "loss": 0.6699, + "step": 183 + }, + { + "epoch": 0.19470899470899472, + "grad_norm": 0.33258526228041074, + "learning_rate": 3.221830985915493e-05, + "loss": 0.6082, + "step": 184 + }, + { + "epoch": 0.19576719576719576, + "grad_norm": 0.352563011531979, + "learning_rate": 3.23943661971831e-05, + "loss": 0.6628, + "step": 185 + }, + { + "epoch": 0.19682539682539682, + "grad_norm": 0.3320170771366527, + "learning_rate": 3.257042253521127e-05, + "loss": 0.6525, + "step": 186 + }, + { + "epoch": 0.19788359788359788, + "grad_norm": 0.32318375497210705, + "learning_rate": 3.274647887323944e-05, + "loss": 0.5735, + "step": 187 + }, + { + "epoch": 0.19894179894179895, + "grad_norm": 0.3270203777340338, + "learning_rate": 3.2922535211267606e-05, + "loss": 0.5571, + "step": 188 + }, + { + "epoch": 0.2, + "grad_norm": 0.3469279421040238, + "learning_rate": 3.3098591549295775e-05, + "loss": 0.6787, + "step": 189 + }, + { + "epoch": 0.20105820105820105, + "grad_norm": 0.36765912576607546, + "learning_rate": 3.327464788732394e-05, + "loss": 0.7363, + "step": 190 + }, + { + "epoch": 0.2021164021164021, + "grad_norm": 0.34672109468153317, + "learning_rate": 3.345070422535211e-05, + "loss": 0.637, + "step": 191 + }, + { + "epoch": 0.20317460317460317, + "grad_norm": 0.38433667329288207, + "learning_rate": 3.362676056338028e-05, + "loss": 0.5852, + "step": 192 + }, + { + "epoch": 0.20423280423280424, + "grad_norm": 0.38599947669557544, + "learning_rate": 3.380281690140845e-05, + "loss": 0.6301, + "step": 193 + }, + { + "epoch": 0.2052910052910053, + "grad_norm": 0.40046349554165817, + "learning_rate": 3.397887323943662e-05, + "loss": 0.6748, + "step": 194 + }, + { + "epoch": 0.20634920634920634, + "grad_norm": 0.3979623426535507, + "learning_rate": 3.4154929577464786e-05, + "loss": 0.671, + "step": 195 + }, + { + "epoch": 0.2074074074074074, + "grad_norm": 0.40709279738188864, + "learning_rate": 3.4330985915492955e-05, + "loss": 0.6654, + "step": 196 + }, + { + "epoch": 0.20846560846560847, + "grad_norm": 0.36359801057653723, + "learning_rate": 3.450704225352113e-05, + "loss": 0.6099, + "step": 197 + }, + { + "epoch": 0.20952380952380953, + "grad_norm": 0.38002929487320863, + "learning_rate": 3.46830985915493e-05, + "loss": 0.7158, + "step": 198 + }, + { + "epoch": 0.2105820105820106, + "grad_norm": 0.34947905555847264, + "learning_rate": 3.485915492957747e-05, + "loss": 0.5947, + "step": 199 + }, + { + "epoch": 0.21164021164021163, + "grad_norm": 0.3671022867710289, + "learning_rate": 3.5035211267605636e-05, + "loss": 0.5809, + "step": 200 + }, + { + "epoch": 0.2126984126984127, + "grad_norm": 0.4001809161609547, + "learning_rate": 3.5211267605633805e-05, + "loss": 0.6605, + "step": 201 + }, + { + "epoch": 0.21375661375661376, + "grad_norm": 0.39279284072815546, + "learning_rate": 3.538732394366197e-05, + "loss": 0.6068, + "step": 202 + }, + { + "epoch": 0.21481481481481482, + "grad_norm": 0.33271126101318277, + "learning_rate": 3.556338028169014e-05, + "loss": 0.5607, + "step": 203 + }, + { + "epoch": 0.21587301587301588, + "grad_norm": 0.443314082948306, + "learning_rate": 3.573943661971831e-05, + "loss": 0.6927, + "step": 204 + }, + { + "epoch": 0.21693121693121692, + "grad_norm": 0.31502780054757173, + "learning_rate": 3.5915492957746486e-05, + "loss": 0.6896, + "step": 205 + }, + { + "epoch": 0.21798941798941798, + "grad_norm": 0.38688454724207, + "learning_rate": 3.6091549295774655e-05, + "loss": 0.604, + "step": 206 + }, + { + "epoch": 0.21904761904761905, + "grad_norm": 0.3312260584034657, + "learning_rate": 3.626760563380282e-05, + "loss": 0.6305, + "step": 207 + }, + { + "epoch": 0.2201058201058201, + "grad_norm": 0.34099970700466303, + "learning_rate": 3.644366197183099e-05, + "loss": 0.6235, + "step": 208 + }, + { + "epoch": 0.22116402116402117, + "grad_norm": 0.35741901452146296, + "learning_rate": 3.661971830985916e-05, + "loss": 0.7602, + "step": 209 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.3650578278726511, + "learning_rate": 3.679577464788733e-05, + "loss": 0.6775, + "step": 210 + }, + { + "epoch": 0.22328042328042327, + "grad_norm": 0.32198145647198445, + "learning_rate": 3.69718309859155e-05, + "loss": 0.6514, + "step": 211 + }, + { + "epoch": 0.22433862433862434, + "grad_norm": 0.34345547766950246, + "learning_rate": 3.7147887323943666e-05, + "loss": 0.5875, + "step": 212 + }, + { + "epoch": 0.2253968253968254, + "grad_norm": 0.274703336883991, + "learning_rate": 3.7323943661971835e-05, + "loss": 0.5498, + "step": 213 + }, + { + "epoch": 0.22645502645502646, + "grad_norm": 0.34787089300375457, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.634, + "step": 214 + }, + { + "epoch": 0.2275132275132275, + "grad_norm": 0.32858837285549813, + "learning_rate": 3.767605633802817e-05, + "loss": 0.7155, + "step": 215 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.3549541250892209, + "learning_rate": 3.785211267605634e-05, + "loss": 0.658, + "step": 216 + }, + { + "epoch": 0.22962962962962963, + "grad_norm": 0.3726458256752988, + "learning_rate": 3.802816901408451e-05, + "loss": 0.6604, + "step": 217 + }, + { + "epoch": 0.2306878306878307, + "grad_norm": 0.3698559943959681, + "learning_rate": 3.820422535211268e-05, + "loss": 0.7083, + "step": 218 + }, + { + "epoch": 0.23174603174603176, + "grad_norm": 0.3461402136268809, + "learning_rate": 3.8380281690140847e-05, + "loss": 0.6028, + "step": 219 + }, + { + "epoch": 0.2328042328042328, + "grad_norm": 0.36882537497597867, + "learning_rate": 3.8556338028169015e-05, + "loss": 0.5933, + "step": 220 + }, + { + "epoch": 0.23386243386243386, + "grad_norm": 0.32269501706477743, + "learning_rate": 3.8732394366197184e-05, + "loss": 0.5564, + "step": 221 + }, + { + "epoch": 0.23492063492063492, + "grad_norm": 0.40676474989986633, + "learning_rate": 3.890845070422535e-05, + "loss": 0.5672, + "step": 222 + }, + { + "epoch": 0.23597883597883598, + "grad_norm": 0.31466605195623004, + "learning_rate": 3.908450704225352e-05, + "loss": 0.6466, + "step": 223 + }, + { + "epoch": 0.23703703703703705, + "grad_norm": 0.4176582469387856, + "learning_rate": 3.926056338028169e-05, + "loss": 0.588, + "step": 224 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 0.3300399938594373, + "learning_rate": 3.943661971830986e-05, + "loss": 0.653, + "step": 225 + }, + { + "epoch": 0.23915343915343915, + "grad_norm": 0.37714569544063986, + "learning_rate": 3.9612676056338034e-05, + "loss": 0.5, + "step": 226 + }, + { + "epoch": 0.2402116402116402, + "grad_norm": 0.40552770928374765, + "learning_rate": 3.97887323943662e-05, + "loss": 0.6434, + "step": 227 + }, + { + "epoch": 0.24126984126984127, + "grad_norm": 0.4636367646766144, + "learning_rate": 3.996478873239437e-05, + "loss": 0.675, + "step": 228 + }, + { + "epoch": 0.24232804232804234, + "grad_norm": 0.4172347857566623, + "learning_rate": 4.014084507042254e-05, + "loss": 0.6065, + "step": 229 + }, + { + "epoch": 0.24338624338624337, + "grad_norm": 0.5288156764553862, + "learning_rate": 4.031690140845071e-05, + "loss": 0.6875, + "step": 230 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 0.36944896552596407, + "learning_rate": 4.049295774647888e-05, + "loss": 0.6154, + "step": 231 + }, + { + "epoch": 0.2455026455026455, + "grad_norm": 0.4691395217338354, + "learning_rate": 4.0669014084507045e-05, + "loss": 0.7015, + "step": 232 + }, + { + "epoch": 0.24656084656084656, + "grad_norm": 0.35238179145757065, + "learning_rate": 4.0845070422535214e-05, + "loss": 0.6848, + "step": 233 + }, + { + "epoch": 0.24761904761904763, + "grad_norm": 0.483662014218935, + "learning_rate": 4.102112676056338e-05, + "loss": 0.6265, + "step": 234 + }, + { + "epoch": 0.24867724867724866, + "grad_norm": 0.3765557351707901, + "learning_rate": 4.119718309859155e-05, + "loss": 0.685, + "step": 235 + }, + { + "epoch": 0.24973544973544973, + "grad_norm": 0.43161674678571094, + "learning_rate": 4.137323943661972e-05, + "loss": 0.5812, + "step": 236 + }, + { + "epoch": 0.2507936507936508, + "grad_norm": 0.3554164355775738, + "learning_rate": 4.154929577464789e-05, + "loss": 0.5958, + "step": 237 + }, + { + "epoch": 0.2518518518518518, + "grad_norm": 0.48752122898965977, + "learning_rate": 4.172535211267606e-05, + "loss": 0.6589, + "step": 238 + }, + { + "epoch": 0.2529100529100529, + "grad_norm": 0.41293508091850323, + "learning_rate": 4.1901408450704226e-05, + "loss": 0.6455, + "step": 239 + }, + { + "epoch": 0.25396825396825395, + "grad_norm": 0.4459574033198097, + "learning_rate": 4.2077464788732394e-05, + "loss": 0.696, + "step": 240 + }, + { + "epoch": 0.25502645502645505, + "grad_norm": 0.5261242261127165, + "learning_rate": 4.225352112676056e-05, + "loss": 0.612, + "step": 241 + }, + { + "epoch": 0.2560846560846561, + "grad_norm": 0.4137102192090692, + "learning_rate": 4.242957746478873e-05, + "loss": 0.5578, + "step": 242 + }, + { + "epoch": 0.2571428571428571, + "grad_norm": 0.570279904177202, + "learning_rate": 4.26056338028169e-05, + "loss": 0.6889, + "step": 243 + }, + { + "epoch": 0.2582010582010582, + "grad_norm": 0.5430310414509273, + "learning_rate": 4.278169014084507e-05, + "loss": 0.6589, + "step": 244 + }, + { + "epoch": 0.25925925925925924, + "grad_norm": 0.441503758930117, + "learning_rate": 4.295774647887324e-05, + "loss": 0.6545, + "step": 245 + }, + { + "epoch": 0.26031746031746034, + "grad_norm": 0.5384182838717864, + "learning_rate": 4.3133802816901406e-05, + "loss": 0.6461, + "step": 246 + }, + { + "epoch": 0.2613756613756614, + "grad_norm": 0.3999205740390739, + "learning_rate": 4.3309859154929575e-05, + "loss": 0.589, + "step": 247 + }, + { + "epoch": 0.2624338624338624, + "grad_norm": 0.4747833562137057, + "learning_rate": 4.348591549295774e-05, + "loss": 0.6146, + "step": 248 + }, + { + "epoch": 0.2634920634920635, + "grad_norm": 0.576119057299997, + "learning_rate": 4.366197183098591e-05, + "loss": 0.6782, + "step": 249 + }, + { + "epoch": 0.26455026455026454, + "grad_norm": 0.5322525365860533, + "learning_rate": 4.383802816901409e-05, + "loss": 0.5898, + "step": 250 + }, + { + "epoch": 0.2656084656084656, + "grad_norm": 0.5225758045153239, + "learning_rate": 4.4014084507042256e-05, + "loss": 0.5973, + "step": 251 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.1554341236749963, + "learning_rate": 4.4190140845070424e-05, + "loss": 0.4985, + "step": 252 + }, + { + "epoch": 0.2677248677248677, + "grad_norm": 0.47852793764541357, + "learning_rate": 4.436619718309859e-05, + "loss": 0.5225, + "step": 253 + }, + { + "epoch": 0.2687830687830688, + "grad_norm": 0.41663440733827256, + "learning_rate": 4.454225352112676e-05, + "loss": 0.6353, + "step": 254 + }, + { + "epoch": 0.2698412698412698, + "grad_norm": 0.5546294618558951, + "learning_rate": 4.471830985915493e-05, + "loss": 0.6392, + "step": 255 + }, + { + "epoch": 0.2708994708994709, + "grad_norm": 0.37390131385634334, + "learning_rate": 4.48943661971831e-05, + "loss": 0.69, + "step": 256 + }, + { + "epoch": 0.27195767195767195, + "grad_norm": 1.3249897376641209, + "learning_rate": 4.507042253521127e-05, + "loss": 0.5528, + "step": 257 + }, + { + "epoch": 0.273015873015873, + "grad_norm": 0.4769685334018541, + "learning_rate": 4.5246478873239436e-05, + "loss": 0.5465, + "step": 258 + }, + { + "epoch": 0.2740740740740741, + "grad_norm": 0.639538521986945, + "learning_rate": 4.542253521126761e-05, + "loss": 0.6834, + "step": 259 + }, + { + "epoch": 0.2751322751322751, + "grad_norm": 0.5219205657799278, + "learning_rate": 4.559859154929578e-05, + "loss": 0.5925, + "step": 260 + }, + { + "epoch": 0.2761904761904762, + "grad_norm": 0.47303904730740937, + "learning_rate": 4.577464788732395e-05, + "loss": 0.6291, + "step": 261 + }, + { + "epoch": 0.27724867724867724, + "grad_norm": 0.4124587679203123, + "learning_rate": 4.595070422535212e-05, + "loss": 0.5333, + "step": 262 + }, + { + "epoch": 0.2783068783068783, + "grad_norm": 0.39574829022247976, + "learning_rate": 4.6126760563380286e-05, + "loss": 0.6118, + "step": 263 + }, + { + "epoch": 0.27936507936507937, + "grad_norm": 0.4906299930682528, + "learning_rate": 4.6302816901408455e-05, + "loss": 0.5492, + "step": 264 + }, + { + "epoch": 0.2804232804232804, + "grad_norm": 0.45541184091560766, + "learning_rate": 4.647887323943662e-05, + "loss": 0.6135, + "step": 265 + }, + { + "epoch": 0.2814814814814815, + "grad_norm": 0.3569305414446637, + "learning_rate": 4.665492957746479e-05, + "loss": 0.5374, + "step": 266 + }, + { + "epoch": 0.28253968253968254, + "grad_norm": 0.43109469741112555, + "learning_rate": 4.683098591549296e-05, + "loss": 0.6969, + "step": 267 + }, + { + "epoch": 0.28359788359788357, + "grad_norm": 0.34850457997668804, + "learning_rate": 4.700704225352113e-05, + "loss": 0.5812, + "step": 268 + }, + { + "epoch": 0.28465608465608466, + "grad_norm": 0.43017148890786167, + "learning_rate": 4.71830985915493e-05, + "loss": 0.6669, + "step": 269 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.3979891127651658, + "learning_rate": 4.7359154929577466e-05, + "loss": 0.5677, + "step": 270 + }, + { + "epoch": 0.2867724867724868, + "grad_norm": 0.5535072160058837, + "learning_rate": 4.7535211267605635e-05, + "loss": 0.6401, + "step": 271 + }, + { + "epoch": 0.2878306878306878, + "grad_norm": 0.4788371326408822, + "learning_rate": 4.7711267605633804e-05, + "loss": 0.6775, + "step": 272 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 0.37053522739872213, + "learning_rate": 4.788732394366197e-05, + "loss": 0.6266, + "step": 273 + }, + { + "epoch": 0.28994708994708995, + "grad_norm": 0.44280982619784764, + "learning_rate": 4.806338028169015e-05, + "loss": 0.6108, + "step": 274 + }, + { + "epoch": 0.291005291005291, + "grad_norm": 0.34292181255238274, + "learning_rate": 4.8239436619718316e-05, + "loss": 0.5563, + "step": 275 + }, + { + "epoch": 0.2920634920634921, + "grad_norm": 0.4678353671213705, + "learning_rate": 4.8415492957746485e-05, + "loss": 0.6338, + "step": 276 + }, + { + "epoch": 0.2931216931216931, + "grad_norm": 0.3603311974292169, + "learning_rate": 4.8591549295774653e-05, + "loss": 0.565, + "step": 277 + }, + { + "epoch": 0.29417989417989415, + "grad_norm": 0.4616974029069646, + "learning_rate": 4.876760563380282e-05, + "loss": 0.5827, + "step": 278 + }, + { + "epoch": 0.29523809523809524, + "grad_norm": 0.4417612006792997, + "learning_rate": 4.894366197183099e-05, + "loss": 0.7125, + "step": 279 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 0.6134549574544247, + "learning_rate": 4.911971830985916e-05, + "loss": 0.6591, + "step": 280 + }, + { + "epoch": 0.29735449735449737, + "grad_norm": 0.37720738345155475, + "learning_rate": 4.929577464788733e-05, + "loss": 0.5585, + "step": 281 + }, + { + "epoch": 0.2984126984126984, + "grad_norm": 0.4237444553725601, + "learning_rate": 4.9471830985915497e-05, + "loss": 0.6572, + "step": 282 + }, + { + "epoch": 0.29947089947089944, + "grad_norm": 0.36275418041559837, + "learning_rate": 4.9647887323943665e-05, + "loss": 0.6876, + "step": 283 + }, + { + "epoch": 0.30052910052910053, + "grad_norm": 0.5213976463565436, + "learning_rate": 4.9823943661971834e-05, + "loss": 0.6989, + "step": 284 + }, + { + "epoch": 0.30158730158730157, + "grad_norm": 0.3570263641303588, + "learning_rate": 5e-05, + "loss": 0.6558, + "step": 285 + }, + { + "epoch": 0.30264550264550266, + "grad_norm": 0.4185098108573085, + "learning_rate": 4.998039984319875e-05, + "loss": 0.5007, + "step": 286 + }, + { + "epoch": 0.3037037037037037, + "grad_norm": 0.3850834324831248, + "learning_rate": 4.9960799686397494e-05, + "loss": 0.5663, + "step": 287 + }, + { + "epoch": 0.3047619047619048, + "grad_norm": 0.42577227525237116, + "learning_rate": 4.994119952959624e-05, + "loss": 0.6345, + "step": 288 + }, + { + "epoch": 0.3058201058201058, + "grad_norm": 0.365037307220747, + "learning_rate": 4.9921599372794986e-05, + "loss": 0.6379, + "step": 289 + }, + { + "epoch": 0.30687830687830686, + "grad_norm": 0.4118237945823454, + "learning_rate": 4.990199921599373e-05, + "loss": 0.634, + "step": 290 + }, + { + "epoch": 0.30793650793650795, + "grad_norm": 0.33860288653166326, + "learning_rate": 4.988239905919248e-05, + "loss": 0.5576, + "step": 291 + }, + { + "epoch": 0.308994708994709, + "grad_norm": 0.4301618112703962, + "learning_rate": 4.986279890239122e-05, + "loss": 0.5168, + "step": 292 + }, + { + "epoch": 0.3100529100529101, + "grad_norm": 0.3153318738893075, + "learning_rate": 4.984319874558997e-05, + "loss": 0.5811, + "step": 293 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 0.363675255321121, + "learning_rate": 4.982359858878871e-05, + "loss": 0.5799, + "step": 294 + }, + { + "epoch": 0.31216931216931215, + "grad_norm": 0.34691937369358705, + "learning_rate": 4.980399843198746e-05, + "loss": 0.5285, + "step": 295 + }, + { + "epoch": 0.31322751322751324, + "grad_norm": 0.36271401144515686, + "learning_rate": 4.9784398275186204e-05, + "loss": 0.5956, + "step": 296 + }, + { + "epoch": 0.3142857142857143, + "grad_norm": 0.39889753761155844, + "learning_rate": 4.9764798118384946e-05, + "loss": 0.6149, + "step": 297 + }, + { + "epoch": 0.31534391534391537, + "grad_norm": 1.4954635983258886, + "learning_rate": 4.9745197961583695e-05, + "loss": 0.6453, + "step": 298 + }, + { + "epoch": 0.3164021164021164, + "grad_norm": 0.453710769398266, + "learning_rate": 4.9725597804782445e-05, + "loss": 0.6233, + "step": 299 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 0.3324927531881442, + "learning_rate": 4.970599764798119e-05, + "loss": 0.5543, + "step": 300 + }, + { + "epoch": 0.31851851851851853, + "grad_norm": 0.3623203424678947, + "learning_rate": 4.968639749117993e-05, + "loss": 0.6053, + "step": 301 + }, + { + "epoch": 0.31957671957671957, + "grad_norm": 0.37974058426507107, + "learning_rate": 4.966679733437868e-05, + "loss": 0.6463, + "step": 302 + }, + { + "epoch": 0.32063492063492066, + "grad_norm": 0.37093076432402033, + "learning_rate": 4.964719717757742e-05, + "loss": 0.6168, + "step": 303 + }, + { + "epoch": 0.3216931216931217, + "grad_norm": 0.4056043538201469, + "learning_rate": 4.962759702077617e-05, + "loss": 0.7495, + "step": 304 + }, + { + "epoch": 0.32275132275132273, + "grad_norm": 0.328152358631735, + "learning_rate": 4.960799686397491e-05, + "loss": 0.5899, + "step": 305 + }, + { + "epoch": 0.3238095238095238, + "grad_norm": 0.35175655000013734, + "learning_rate": 4.958839670717366e-05, + "loss": 0.5318, + "step": 306 + }, + { + "epoch": 0.32486772486772486, + "grad_norm": 0.3569933392052981, + "learning_rate": 4.9568796550372405e-05, + "loss": 0.6631, + "step": 307 + }, + { + "epoch": 0.32592592592592595, + "grad_norm": 0.37456493527809115, + "learning_rate": 4.9549196393571154e-05, + "loss": 0.7267, + "step": 308 + }, + { + "epoch": 0.326984126984127, + "grad_norm": 6.48230463042188, + "learning_rate": 4.9529596236769897e-05, + "loss": 0.9326, + "step": 309 + }, + { + "epoch": 0.328042328042328, + "grad_norm": 0.597786693643111, + "learning_rate": 4.950999607996864e-05, + "loss": 0.6035, + "step": 310 + }, + { + "epoch": 0.3291005291005291, + "grad_norm": 1.0072574874604938, + "learning_rate": 4.949039592316739e-05, + "loss": 0.6678, + "step": 311 + }, + { + "epoch": 0.33015873015873015, + "grad_norm": 2.015193754844906, + "learning_rate": 4.947079576636614e-05, + "loss": 0.6074, + "step": 312 + }, + { + "epoch": 0.33121693121693124, + "grad_norm": 0.9965912669595596, + "learning_rate": 4.945119560956487e-05, + "loss": 0.5941, + "step": 313 + }, + { + "epoch": 0.3322751322751323, + "grad_norm": 0.6804805860670657, + "learning_rate": 4.943159545276362e-05, + "loss": 0.6363, + "step": 314 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.6017533613165538, + "learning_rate": 4.941199529596237e-05, + "loss": 0.6276, + "step": 315 + }, + { + "epoch": 0.3343915343915344, + "grad_norm": 0.704554658779125, + "learning_rate": 4.939239513916112e-05, + "loss": 0.6388, + "step": 316 + }, + { + "epoch": 0.33544973544973544, + "grad_norm": 0.5032673111541713, + "learning_rate": 4.937279498235986e-05, + "loss": 0.5389, + "step": 317 + }, + { + "epoch": 0.33650793650793653, + "grad_norm": 0.5015885516895849, + "learning_rate": 4.9353194825558606e-05, + "loss": 0.5515, + "step": 318 + }, + { + "epoch": 0.33756613756613757, + "grad_norm": 0.5097653076876837, + "learning_rate": 4.9333594668757355e-05, + "loss": 0.5483, + "step": 319 + }, + { + "epoch": 0.3386243386243386, + "grad_norm": 0.689407061137572, + "learning_rate": 4.93139945119561e-05, + "loss": 0.6595, + "step": 320 + }, + { + "epoch": 0.3396825396825397, + "grad_norm": 1.3956902752361948, + "learning_rate": 4.929439435515485e-05, + "loss": 0.6863, + "step": 321 + }, + { + "epoch": 0.34074074074074073, + "grad_norm": 0.6477224984591293, + "learning_rate": 4.927479419835359e-05, + "loss": 0.6719, + "step": 322 + }, + { + "epoch": 0.3417989417989418, + "grad_norm": 0.4791685078247184, + "learning_rate": 4.925519404155233e-05, + "loss": 0.6532, + "step": 323 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 0.7647455686764629, + "learning_rate": 4.923559388475108e-05, + "loss": 0.5633, + "step": 324 + }, + { + "epoch": 0.3439153439153439, + "grad_norm": 0.540162982463785, + "learning_rate": 4.921599372794983e-05, + "loss": 0.6154, + "step": 325 + }, + { + "epoch": 0.344973544973545, + "grad_norm": 0.5987926929939865, + "learning_rate": 4.9196393571148566e-05, + "loss": 0.5485, + "step": 326 + }, + { + "epoch": 0.346031746031746, + "grad_norm": 0.47012692994553495, + "learning_rate": 4.9176793414347316e-05, + "loss": 0.6356, + "step": 327 + }, + { + "epoch": 0.3470899470899471, + "grad_norm": 0.6190282717604556, + "learning_rate": 4.9157193257546065e-05, + "loss": 0.737, + "step": 328 + }, + { + "epoch": 0.34814814814814815, + "grad_norm": 0.48951022131560773, + "learning_rate": 4.9137593100744814e-05, + "loss": 0.6171, + "step": 329 + }, + { + "epoch": 0.3492063492063492, + "grad_norm": 0.5704216168233464, + "learning_rate": 4.911799294394355e-05, + "loss": 0.6843, + "step": 330 + }, + { + "epoch": 0.3502645502645503, + "grad_norm": 0.3828305070852499, + "learning_rate": 4.90983927871423e-05, + "loss": 0.6277, + "step": 331 + }, + { + "epoch": 0.3513227513227513, + "grad_norm": 0.4899467959752788, + "learning_rate": 4.907879263034105e-05, + "loss": 0.5997, + "step": 332 + }, + { + "epoch": 0.3523809523809524, + "grad_norm": 0.4491699388666299, + "learning_rate": 4.905919247353979e-05, + "loss": 0.5925, + "step": 333 + }, + { + "epoch": 0.35343915343915344, + "grad_norm": 0.425441565694508, + "learning_rate": 4.903959231673853e-05, + "loss": 0.5301, + "step": 334 + }, + { + "epoch": 0.3544973544973545, + "grad_norm": 0.45512477300228416, + "learning_rate": 4.901999215993728e-05, + "loss": 0.5845, + "step": 335 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 10.638459759457483, + "learning_rate": 4.9000392003136025e-05, + "loss": 0.7328, + "step": 336 + }, + { + "epoch": 0.3566137566137566, + "grad_norm": 0.6506851562077983, + "learning_rate": 4.8980791846334774e-05, + "loss": 0.5733, + "step": 337 + }, + { + "epoch": 0.3576719576719577, + "grad_norm": 0.3597461888788251, + "learning_rate": 4.896119168953352e-05, + "loss": 0.5873, + "step": 338 + }, + { + "epoch": 0.35873015873015873, + "grad_norm": 0.489755609131464, + "learning_rate": 4.894159153273226e-05, + "loss": 0.5659, + "step": 339 + }, + { + "epoch": 0.35978835978835977, + "grad_norm": 0.4683550804241493, + "learning_rate": 4.892199137593101e-05, + "loss": 0.62, + "step": 340 + }, + { + "epoch": 0.36084656084656086, + "grad_norm": 0.5994458890657961, + "learning_rate": 4.890239121912976e-05, + "loss": 0.6241, + "step": 341 + }, + { + "epoch": 0.3619047619047619, + "grad_norm": 0.36417066783567376, + "learning_rate": 4.88827910623285e-05, + "loss": 0.5937, + "step": 342 + }, + { + "epoch": 0.362962962962963, + "grad_norm": 0.5283081961653806, + "learning_rate": 4.886319090552724e-05, + "loss": 0.6184, + "step": 343 + }, + { + "epoch": 0.364021164021164, + "grad_norm": 0.39305815236656894, + "learning_rate": 4.884359074872599e-05, + "loss": 0.6288, + "step": 344 + }, + { + "epoch": 0.36507936507936506, + "grad_norm": 0.45312073034904926, + "learning_rate": 4.882399059192474e-05, + "loss": 0.6075, + "step": 345 + }, + { + "epoch": 0.36613756613756615, + "grad_norm": 0.4058520594663374, + "learning_rate": 4.8804390435123484e-05, + "loss": 0.694, + "step": 346 + }, + { + "epoch": 0.3671957671957672, + "grad_norm": 0.5002449512089666, + "learning_rate": 4.8784790278322226e-05, + "loss": 0.5748, + "step": 347 + }, + { + "epoch": 0.3682539682539683, + "grad_norm": 0.4117436134851183, + "learning_rate": 4.8765190121520976e-05, + "loss": 0.6376, + "step": 348 + }, + { + "epoch": 0.3693121693121693, + "grad_norm": 0.502404236336507, + "learning_rate": 4.874558996471972e-05, + "loss": 0.6094, + "step": 349 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 0.40455105073614056, + "learning_rate": 4.872598980791847e-05, + "loss": 0.6217, + "step": 350 + }, + { + "epoch": 0.37142857142857144, + "grad_norm": 0.35704360652012157, + "learning_rate": 4.870638965111721e-05, + "loss": 0.5024, + "step": 351 + }, + { + "epoch": 0.3724867724867725, + "grad_norm": 0.4264916243948178, + "learning_rate": 4.868678949431595e-05, + "loss": 0.5388, + "step": 352 + }, + { + "epoch": 0.37354497354497357, + "grad_norm": 0.3327240004352966, + "learning_rate": 4.86671893375147e-05, + "loss": 0.5935, + "step": 353 + }, + { + "epoch": 0.3746031746031746, + "grad_norm": 0.41835670277009634, + "learning_rate": 4.864758918071345e-05, + "loss": 0.635, + "step": 354 + }, + { + "epoch": 0.37566137566137564, + "grad_norm": 0.3307152093175963, + "learning_rate": 4.862798902391219e-05, + "loss": 0.618, + "step": 355 + }, + { + "epoch": 0.37671957671957673, + "grad_norm": 0.36724207232476874, + "learning_rate": 4.8608388867110936e-05, + "loss": 0.5644, + "step": 356 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 0.33194305741881575, + "learning_rate": 4.8588788710309685e-05, + "loss": 0.5842, + "step": 357 + }, + { + "epoch": 0.37883597883597886, + "grad_norm": 0.35340969186734694, + "learning_rate": 4.8569188553508434e-05, + "loss": 0.6276, + "step": 358 + }, + { + "epoch": 0.3798941798941799, + "grad_norm": 0.3326126922758559, + "learning_rate": 4.854958839670718e-05, + "loss": 0.6179, + "step": 359 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 0.3405811498157609, + "learning_rate": 4.852998823990592e-05, + "loss": 0.6401, + "step": 360 + }, + { + "epoch": 0.382010582010582, + "grad_norm": 0.3226096419931992, + "learning_rate": 4.851038808310467e-05, + "loss": 0.605, + "step": 361 + }, + { + "epoch": 0.38306878306878306, + "grad_norm": 0.32239461661768787, + "learning_rate": 4.849078792630341e-05, + "loss": 0.6507, + "step": 362 + }, + { + "epoch": 0.38412698412698415, + "grad_norm": 0.3484471583643884, + "learning_rate": 4.847118776950216e-05, + "loss": 0.605, + "step": 363 + }, + { + "epoch": 0.3851851851851852, + "grad_norm": 0.3112894013568265, + "learning_rate": 4.84515876127009e-05, + "loss": 0.5174, + "step": 364 + }, + { + "epoch": 0.3862433862433862, + "grad_norm": 0.2899220938284599, + "learning_rate": 4.843198745589965e-05, + "loss": 0.5094, + "step": 365 + }, + { + "epoch": 0.3873015873015873, + "grad_norm": 0.28917708457385394, + "learning_rate": 4.8412387299098395e-05, + "loss": 0.5609, + "step": 366 + }, + { + "epoch": 0.38835978835978835, + "grad_norm": 0.3538788286573949, + "learning_rate": 4.8392787142297144e-05, + "loss": 0.6225, + "step": 367 + }, + { + "epoch": 0.38941798941798944, + "grad_norm": 0.3012030881102022, + "learning_rate": 4.8373186985495886e-05, + "loss": 0.6549, + "step": 368 + }, + { + "epoch": 0.3904761904761905, + "grad_norm": 0.3265061631766346, + "learning_rate": 4.835358682869463e-05, + "loss": 0.5666, + "step": 369 + }, + { + "epoch": 0.3915343915343915, + "grad_norm": 0.2683583970944115, + "learning_rate": 4.833398667189338e-05, + "loss": 0.5693, + "step": 370 + }, + { + "epoch": 0.3925925925925926, + "grad_norm": 0.3579867664845553, + "learning_rate": 4.831438651509213e-05, + "loss": 0.6117, + "step": 371 + }, + { + "epoch": 0.39365079365079364, + "grad_norm": 0.29023429064286466, + "learning_rate": 4.829478635829087e-05, + "loss": 0.6243, + "step": 372 + }, + { + "epoch": 0.39470899470899473, + "grad_norm": 0.30720865653544066, + "learning_rate": 4.827518620148961e-05, + "loss": 0.6137, + "step": 373 + }, + { + "epoch": 0.39576719576719577, + "grad_norm": 0.28341688168565876, + "learning_rate": 4.825558604468836e-05, + "loss": 0.6066, + "step": 374 + }, + { + "epoch": 0.3968253968253968, + "grad_norm": 0.27817987141750344, + "learning_rate": 4.8235985887887104e-05, + "loss": 0.523, + "step": 375 + }, + { + "epoch": 0.3978835978835979, + "grad_norm": 0.3145876613453268, + "learning_rate": 4.821638573108585e-05, + "loss": 0.6418, + "step": 376 + }, + { + "epoch": 0.39894179894179893, + "grad_norm": 0.3096978266574846, + "learning_rate": 4.8196785574284596e-05, + "loss": 0.6102, + "step": 377 + }, + { + "epoch": 0.4, + "grad_norm": 0.29312324806193996, + "learning_rate": 4.8177185417483345e-05, + "loss": 0.6043, + "step": 378 + }, + { + "epoch": 0.40105820105820106, + "grad_norm": 0.3215404710144968, + "learning_rate": 4.815758526068209e-05, + "loss": 0.5813, + "step": 379 + }, + { + "epoch": 0.4021164021164021, + "grad_norm": 0.2843918875735979, + "learning_rate": 4.813798510388084e-05, + "loss": 0.6212, + "step": 380 + }, + { + "epoch": 0.4031746031746032, + "grad_norm": 0.30037403275169, + "learning_rate": 4.811838494707958e-05, + "loss": 0.6729, + "step": 381 + }, + { + "epoch": 0.4042328042328042, + "grad_norm": 0.3207760435283416, + "learning_rate": 4.809878479027832e-05, + "loss": 0.55, + "step": 382 + }, + { + "epoch": 0.4052910052910053, + "grad_norm": 0.26503156107162107, + "learning_rate": 4.807918463347707e-05, + "loss": 0.5419, + "step": 383 + }, + { + "epoch": 0.40634920634920635, + "grad_norm": 0.2882065071584723, + "learning_rate": 4.805958447667582e-05, + "loss": 0.569, + "step": 384 + }, + { + "epoch": 0.4074074074074074, + "grad_norm": 0.29455047481787927, + "learning_rate": 4.8039984319874556e-05, + "loss": 0.5042, + "step": 385 + }, + { + "epoch": 0.4084656084656085, + "grad_norm": 0.29685194902902406, + "learning_rate": 4.8020384163073305e-05, + "loss": 0.5383, + "step": 386 + }, + { + "epoch": 0.4095238095238095, + "grad_norm": 0.376863384403299, + "learning_rate": 4.8000784006272054e-05, + "loss": 0.6392, + "step": 387 + }, + { + "epoch": 0.4105820105820106, + "grad_norm": 0.3371594913183494, + "learning_rate": 4.79811838494708e-05, + "loss": 0.661, + "step": 388 + }, + { + "epoch": 0.41164021164021164, + "grad_norm": 0.34032184447977354, + "learning_rate": 4.796158369266954e-05, + "loss": 0.5174, + "step": 389 + }, + { + "epoch": 0.4126984126984127, + "grad_norm": 0.3887738951246669, + "learning_rate": 4.794198353586829e-05, + "loss": 0.6675, + "step": 390 + }, + { + "epoch": 0.41375661375661377, + "grad_norm": 0.29671883772859686, + "learning_rate": 4.792238337906704e-05, + "loss": 0.5057, + "step": 391 + }, + { + "epoch": 0.4148148148148148, + "grad_norm": 0.35504708481760416, + "learning_rate": 4.790278322226578e-05, + "loss": 0.5723, + "step": 392 + }, + { + "epoch": 0.4158730158730159, + "grad_norm": 0.3187145224838173, + "learning_rate": 4.788318306546453e-05, + "loss": 0.5921, + "step": 393 + }, + { + "epoch": 0.41693121693121693, + "grad_norm": 0.296017951892389, + "learning_rate": 4.786358290866327e-05, + "loss": 0.5281, + "step": 394 + }, + { + "epoch": 0.41798941798941797, + "grad_norm": 0.39656850609570043, + "learning_rate": 4.7843982751862015e-05, + "loss": 0.6467, + "step": 395 + }, + { + "epoch": 0.41904761904761906, + "grad_norm": 0.32082406571324745, + "learning_rate": 4.7824382595060764e-05, + "loss": 0.5867, + "step": 396 + }, + { + "epoch": 0.4201058201058201, + "grad_norm": 0.4041020790707597, + "learning_rate": 4.780478243825951e-05, + "loss": 0.533, + "step": 397 + }, + { + "epoch": 0.4211640211640212, + "grad_norm": 0.38193752458595226, + "learning_rate": 4.778518228145825e-05, + "loss": 0.5414, + "step": 398 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 0.30383562921606366, + "learning_rate": 4.7765582124657e-05, + "loss": 0.5696, + "step": 399 + }, + { + "epoch": 0.42328042328042326, + "grad_norm": 0.3922882135971307, + "learning_rate": 4.774598196785575e-05, + "loss": 0.5608, + "step": 400 + }, + { + "epoch": 0.42433862433862435, + "grad_norm": 0.43007597049774227, + "learning_rate": 4.772638181105449e-05, + "loss": 0.5647, + "step": 401 + }, + { + "epoch": 0.4253968253968254, + "grad_norm": 0.5686439579885741, + "learning_rate": 4.770678165425323e-05, + "loss": 0.4939, + "step": 402 + }, + { + "epoch": 0.4264550264550265, + "grad_norm": 0.36392633625996323, + "learning_rate": 4.768718149745198e-05, + "loss": 0.5443, + "step": 403 + }, + { + "epoch": 0.4275132275132275, + "grad_norm": 0.3272462847928772, + "learning_rate": 4.766758134065073e-05, + "loss": 0.5693, + "step": 404 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.3124002758364203, + "learning_rate": 4.7647981183849473e-05, + "loss": 0.6362, + "step": 405 + }, + { + "epoch": 0.42962962962962964, + "grad_norm": 0.29871765035322945, + "learning_rate": 4.7628381027048216e-05, + "loss": 0.5147, + "step": 406 + }, + { + "epoch": 0.4306878306878307, + "grad_norm": 0.3962457157308558, + "learning_rate": 4.7608780870246965e-05, + "loss": 0.5617, + "step": 407 + }, + { + "epoch": 0.43174603174603177, + "grad_norm": 0.334309491928329, + "learning_rate": 4.758918071344571e-05, + "loss": 0.5507, + "step": 408 + }, + { + "epoch": 0.4328042328042328, + "grad_norm": 0.3494500717283481, + "learning_rate": 4.756958055664446e-05, + "loss": 0.5789, + "step": 409 + }, + { + "epoch": 0.43386243386243384, + "grad_norm": 0.9776571014024243, + "learning_rate": 4.75499803998432e-05, + "loss": 0.5971, + "step": 410 + }, + { + "epoch": 0.43492063492063493, + "grad_norm": 0.36178604692099614, + "learning_rate": 4.753038024304194e-05, + "loss": 0.5824, + "step": 411 + }, + { + "epoch": 0.43597883597883597, + "grad_norm": 0.3701183639060816, + "learning_rate": 4.751078008624069e-05, + "loss": 0.6046, + "step": 412 + }, + { + "epoch": 0.43703703703703706, + "grad_norm": 0.37912182400136274, + "learning_rate": 4.749117992943944e-05, + "loss": 0.5887, + "step": 413 + }, + { + "epoch": 0.4380952380952381, + "grad_norm": 0.34637123340056375, + "learning_rate": 4.747157977263818e-05, + "loss": 0.5773, + "step": 414 + }, + { + "epoch": 0.43915343915343913, + "grad_norm": 0.4073065757199422, + "learning_rate": 4.7451979615836925e-05, + "loss": 0.5738, + "step": 415 + }, + { + "epoch": 0.4402116402116402, + "grad_norm": 0.3344005038216227, + "learning_rate": 4.7432379459035675e-05, + "loss": 0.5061, + "step": 416 + }, + { + "epoch": 0.44126984126984126, + "grad_norm": 0.3739197194136766, + "learning_rate": 4.7412779302234424e-05, + "loss": 0.585, + "step": 417 + }, + { + "epoch": 0.44232804232804235, + "grad_norm": 0.6438115315588397, + "learning_rate": 4.7393179145433166e-05, + "loss": 0.6128, + "step": 418 + }, + { + "epoch": 0.4433862433862434, + "grad_norm": 0.4877567334363792, + "learning_rate": 4.737357898863191e-05, + "loss": 0.6998, + "step": 419 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.33503924359280984, + "learning_rate": 4.735397883183066e-05, + "loss": 0.6582, + "step": 420 + }, + { + "epoch": 0.4455026455026455, + "grad_norm": 0.35412692478679975, + "learning_rate": 4.73343786750294e-05, + "loss": 0.5902, + "step": 421 + }, + { + "epoch": 0.44656084656084655, + "grad_norm": 0.32185468036018056, + "learning_rate": 4.731477851822815e-05, + "loss": 0.5647, + "step": 422 + }, + { + "epoch": 0.44761904761904764, + "grad_norm": 0.33447079814754294, + "learning_rate": 4.729517836142689e-05, + "loss": 0.5391, + "step": 423 + }, + { + "epoch": 0.4486772486772487, + "grad_norm": 5.115754194500417, + "learning_rate": 4.7275578204625635e-05, + "loss": 0.6673, + "step": 424 + }, + { + "epoch": 0.4497354497354497, + "grad_norm": 1.328092342445049, + "learning_rate": 4.7255978047824384e-05, + "loss": 0.7066, + "step": 425 + }, + { + "epoch": 0.4507936507936508, + "grad_norm": 0.40429664376112756, + "learning_rate": 4.7236377891023133e-05, + "loss": 0.6268, + "step": 426 + }, + { + "epoch": 0.45185185185185184, + "grad_norm": 0.35636211087696995, + "learning_rate": 4.7216777734221876e-05, + "loss": 0.5591, + "step": 427 + }, + { + "epoch": 0.45291005291005293, + "grad_norm": 0.8540612192195892, + "learning_rate": 4.719717757742062e-05, + "loss": 0.5389, + "step": 428 + }, + { + "epoch": 0.45396825396825397, + "grad_norm": 0.40756442415791055, + "learning_rate": 4.717757742061937e-05, + "loss": 0.5483, + "step": 429 + }, + { + "epoch": 0.455026455026455, + "grad_norm": 0.3970491214306376, + "learning_rate": 4.715797726381812e-05, + "loss": 0.6473, + "step": 430 + }, + { + "epoch": 0.4560846560846561, + "grad_norm": 0.33077010193886147, + "learning_rate": 4.713837710701686e-05, + "loss": 0.5264, + "step": 431 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.37580554982943354, + "learning_rate": 4.71187769502156e-05, + "loss": 0.5222, + "step": 432 + }, + { + "epoch": 0.4582010582010582, + "grad_norm": 0.4135755906441715, + "learning_rate": 4.709917679341435e-05, + "loss": 0.5681, + "step": 433 + }, + { + "epoch": 0.45925925925925926, + "grad_norm": 0.3556795364307751, + "learning_rate": 4.7079576636613094e-05, + "loss": 0.616, + "step": 434 + }, + { + "epoch": 0.4603174603174603, + "grad_norm": 0.3255431792109049, + "learning_rate": 4.705997647981184e-05, + "loss": 0.6155, + "step": 435 + }, + { + "epoch": 0.4613756613756614, + "grad_norm": 0.3621995346427838, + "learning_rate": 4.7040376323010585e-05, + "loss": 0.5851, + "step": 436 + }, + { + "epoch": 0.4624338624338624, + "grad_norm": 0.31078411014873397, + "learning_rate": 4.702077616620933e-05, + "loss": 0.5034, + "step": 437 + }, + { + "epoch": 0.4634920634920635, + "grad_norm": 0.27503733967112304, + "learning_rate": 4.700117600940808e-05, + "loss": 0.5851, + "step": 438 + }, + { + "epoch": 0.46455026455026455, + "grad_norm": 0.34224618740297685, + "learning_rate": 4.6981575852606826e-05, + "loss": 0.6285, + "step": 439 + }, + { + "epoch": 0.4656084656084656, + "grad_norm": 0.27425336801037914, + "learning_rate": 4.696197569580557e-05, + "loss": 0.5225, + "step": 440 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.3603257769316038, + "learning_rate": 4.694237553900431e-05, + "loss": 0.5921, + "step": 441 + }, + { + "epoch": 0.4677248677248677, + "grad_norm": 0.32429639783867814, + "learning_rate": 4.692277538220306e-05, + "loss": 0.6941, + "step": 442 + }, + { + "epoch": 0.4687830687830688, + "grad_norm": 0.34481993099571034, + "learning_rate": 4.690317522540181e-05, + "loss": 0.5203, + "step": 443 + }, + { + "epoch": 0.46984126984126984, + "grad_norm": 0.3193736572025401, + "learning_rate": 4.688357506860055e-05, + "loss": 0.6302, + "step": 444 + }, + { + "epoch": 0.4708994708994709, + "grad_norm": 1.6187717657921294, + "learning_rate": 4.6863974911799295e-05, + "loss": 0.6413, + "step": 445 + }, + { + "epoch": 0.47195767195767196, + "grad_norm": 0.45961110444236447, + "learning_rate": 4.6844374754998044e-05, + "loss": 0.6542, + "step": 446 + }, + { + "epoch": 0.473015873015873, + "grad_norm": 0.34842912443778445, + "learning_rate": 4.682477459819679e-05, + "loss": 0.5353, + "step": 447 + }, + { + "epoch": 0.4740740740740741, + "grad_norm": 0.6192864098610441, + "learning_rate": 4.6805174441395536e-05, + "loss": 0.4736, + "step": 448 + }, + { + "epoch": 0.47513227513227513, + "grad_norm": 0.43463315211498527, + "learning_rate": 4.678557428459428e-05, + "loss": 0.6634, + "step": 449 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 0.4140965336433174, + "learning_rate": 4.676597412779302e-05, + "loss": 0.6842, + "step": 450 + }, + { + "epoch": 0.47724867724867726, + "grad_norm": 0.3907863076094101, + "learning_rate": 4.674637397099177e-05, + "loss": 0.5236, + "step": 451 + }, + { + "epoch": 0.4783068783068783, + "grad_norm": 0.3553573532226626, + "learning_rate": 4.672677381419052e-05, + "loss": 0.6322, + "step": 452 + }, + { + "epoch": 0.4793650793650794, + "grad_norm": 0.32855146366281446, + "learning_rate": 4.670717365738926e-05, + "loss": 0.5676, + "step": 453 + }, + { + "epoch": 0.4804232804232804, + "grad_norm": 0.3355219316367844, + "learning_rate": 4.6687573500588004e-05, + "loss": 0.6183, + "step": 454 + }, + { + "epoch": 0.48148148148148145, + "grad_norm": 0.33915222521124233, + "learning_rate": 4.6667973343786754e-05, + "loss": 0.5957, + "step": 455 + }, + { + "epoch": 0.48253968253968255, + "grad_norm": 0.2999900043336735, + "learning_rate": 4.66483731869855e-05, + "loss": 0.6239, + "step": 456 + }, + { + "epoch": 0.4835978835978836, + "grad_norm": 1.932322840844984, + "learning_rate": 4.662877303018424e-05, + "loss": 0.6079, + "step": 457 + }, + { + "epoch": 0.4846560846560847, + "grad_norm": 0.3841930370997371, + "learning_rate": 4.660917287338299e-05, + "loss": 0.5339, + "step": 458 + }, + { + "epoch": 0.4857142857142857, + "grad_norm": 0.29299834519404916, + "learning_rate": 4.658957271658174e-05, + "loss": 0.5155, + "step": 459 + }, + { + "epoch": 0.48677248677248675, + "grad_norm": 0.3508678158463962, + "learning_rate": 4.656997255978048e-05, + "loss": 0.5569, + "step": 460 + }, + { + "epoch": 0.48783068783068784, + "grad_norm": 0.370161022664498, + "learning_rate": 4.655037240297922e-05, + "loss": 0.6227, + "step": 461 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 0.336334840849176, + "learning_rate": 4.653077224617797e-05, + "loss": 0.5896, + "step": 462 + }, + { + "epoch": 0.48994708994708996, + "grad_norm": 0.3387385164468496, + "learning_rate": 4.651117208937672e-05, + "loss": 0.5575, + "step": 463 + }, + { + "epoch": 0.491005291005291, + "grad_norm": 0.29356754801120044, + "learning_rate": 4.649157193257546e-05, + "loss": 0.5692, + "step": 464 + }, + { + "epoch": 0.49206349206349204, + "grad_norm": 0.32417509452640764, + "learning_rate": 4.647197177577421e-05, + "loss": 0.542, + "step": 465 + }, + { + "epoch": 0.4931216931216931, + "grad_norm": 0.26738840774433187, + "learning_rate": 4.6452371618972955e-05, + "loss": 0.5007, + "step": 466 + }, + { + "epoch": 0.49417989417989416, + "grad_norm": 0.3639897702647601, + "learning_rate": 4.64327714621717e-05, + "loss": 0.5834, + "step": 467 + }, + { + "epoch": 0.49523809523809526, + "grad_norm": 7.352056250865767, + "learning_rate": 4.641317130537045e-05, + "loss": 0.7152, + "step": 468 + }, + { + "epoch": 0.4962962962962963, + "grad_norm": 0.35916204859142065, + "learning_rate": 4.6393571148569196e-05, + "loss": 0.508, + "step": 469 + }, + { + "epoch": 0.4973544973544973, + "grad_norm": 1.1916607882832544, + "learning_rate": 4.637397099176793e-05, + "loss": 0.716, + "step": 470 + }, + { + "epoch": 0.4984126984126984, + "grad_norm": 0.2879923677665714, + "learning_rate": 4.635437083496668e-05, + "loss": 0.4927, + "step": 471 + }, + { + "epoch": 0.49947089947089945, + "grad_norm": 0.3327630140013385, + "learning_rate": 4.633477067816543e-05, + "loss": 0.567, + "step": 472 + }, + { + "epoch": 0.5005291005291005, + "grad_norm": 0.47964078282416894, + "learning_rate": 4.631517052136417e-05, + "loss": 0.5215, + "step": 473 + }, + { + "epoch": 0.5015873015873016, + "grad_norm": 0.9360332803048105, + "learning_rate": 4.6295570364562915e-05, + "loss": 0.5973, + "step": 474 + }, + { + "epoch": 0.5026455026455027, + "grad_norm": 0.39106581876004903, + "learning_rate": 4.6275970207761664e-05, + "loss": 0.5984, + "step": 475 + }, + { + "epoch": 0.5037037037037037, + "grad_norm": 1.6307210187055834, + "learning_rate": 4.6256370050960414e-05, + "loss": 0.557, + "step": 476 + }, + { + "epoch": 0.5047619047619047, + "grad_norm": 0.29838204099866084, + "learning_rate": 4.6236769894159156e-05, + "loss": 0.584, + "step": 477 + }, + { + "epoch": 0.5058201058201058, + "grad_norm": 0.3257371970902655, + "learning_rate": 4.62171697373579e-05, + "loss": 0.5222, + "step": 478 + }, + { + "epoch": 0.5068783068783069, + "grad_norm": 0.2892518749067453, + "learning_rate": 4.619756958055665e-05, + "loss": 0.6022, + "step": 479 + }, + { + "epoch": 0.5079365079365079, + "grad_norm": 0.3395163891295951, + "learning_rate": 4.617796942375539e-05, + "loss": 0.6507, + "step": 480 + }, + { + "epoch": 0.508994708994709, + "grad_norm": 0.29784589758599245, + "learning_rate": 4.615836926695414e-05, + "loss": 0.5763, + "step": 481 + }, + { + "epoch": 0.5100529100529101, + "grad_norm": 0.28702137011485396, + "learning_rate": 4.613876911015288e-05, + "loss": 0.5317, + "step": 482 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 1.920530223820589, + "learning_rate": 4.6119168953351625e-05, + "loss": 0.5852, + "step": 483 + }, + { + "epoch": 0.5121693121693122, + "grad_norm": 0.482240940409115, + "learning_rate": 4.6099568796550374e-05, + "loss": 0.5854, + "step": 484 + }, + { + "epoch": 0.5132275132275133, + "grad_norm": 0.39713425911160416, + "learning_rate": 4.607996863974912e-05, + "loss": 0.5813, + "step": 485 + }, + { + "epoch": 0.5142857142857142, + "grad_norm": 0.36739851508814947, + "learning_rate": 4.6060368482947866e-05, + "loss": 0.6217, + "step": 486 + }, + { + "epoch": 0.5153439153439153, + "grad_norm": 0.4261831012022294, + "learning_rate": 4.604076832614661e-05, + "loss": 0.577, + "step": 487 + }, + { + "epoch": 0.5164021164021164, + "grad_norm": 4.16824575872653, + "learning_rate": 4.602116816934536e-05, + "loss": 0.8018, + "step": 488 + }, + { + "epoch": 0.5174603174603175, + "grad_norm": 0.4804692579466171, + "learning_rate": 4.6001568012544107e-05, + "loss": 0.5141, + "step": 489 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 0.31014473785231145, + "learning_rate": 4.598196785574285e-05, + "loss": 0.5435, + "step": 490 + }, + { + "epoch": 0.5195767195767196, + "grad_norm": 1.2690098499967961, + "learning_rate": 4.596236769894159e-05, + "loss": 0.5384, + "step": 491 + }, + { + "epoch": 0.5206349206349207, + "grad_norm": 0.3480203912635704, + "learning_rate": 4.594276754214034e-05, + "loss": 0.5376, + "step": 492 + }, + { + "epoch": 0.5216931216931217, + "grad_norm": 0.35152029139562335, + "learning_rate": 4.592316738533908e-05, + "loss": 0.5837, + "step": 493 + }, + { + "epoch": 0.5227513227513227, + "grad_norm": 0.33161605035498015, + "learning_rate": 4.590356722853783e-05, + "loss": 0.5044, + "step": 494 + }, + { + "epoch": 0.5238095238095238, + "grad_norm": 0.36352262970279386, + "learning_rate": 4.5883967071736575e-05, + "loss": 0.5796, + "step": 495 + }, + { + "epoch": 0.5248677248677248, + "grad_norm": 0.3237400201076874, + "learning_rate": 4.586436691493532e-05, + "loss": 0.5769, + "step": 496 + }, + { + "epoch": 0.5259259259259259, + "grad_norm": 2.9866473245279956, + "learning_rate": 4.584476675813407e-05, + "loss": 0.7518, + "step": 497 + }, + { + "epoch": 0.526984126984127, + "grad_norm": 0.6522215400738879, + "learning_rate": 4.5825166601332816e-05, + "loss": 0.5486, + "step": 498 + }, + { + "epoch": 0.5280423280423281, + "grad_norm": 0.42283674583260256, + "learning_rate": 4.580556644453156e-05, + "loss": 0.5667, + "step": 499 + }, + { + "epoch": 0.5291005291005291, + "grad_norm": 0.584791999049459, + "learning_rate": 4.57859662877303e-05, + "loss": 0.5344, + "step": 500 + }, + { + "epoch": 0.5301587301587302, + "grad_norm": 0.40242508358942275, + "learning_rate": 4.576636613092905e-05, + "loss": 0.5313, + "step": 501 + }, + { + "epoch": 0.5312169312169313, + "grad_norm": 0.745312348825906, + "learning_rate": 4.57467659741278e-05, + "loss": 0.5376, + "step": 502 + }, + { + "epoch": 0.5322751322751322, + "grad_norm": 0.49986791711058115, + "learning_rate": 4.572716581732654e-05, + "loss": 0.6753, + "step": 503 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.42496269751477106, + "learning_rate": 4.5707565660525285e-05, + "loss": 0.5841, + "step": 504 + }, + { + "epoch": 0.5343915343915344, + "grad_norm": 0.4181393322777193, + "learning_rate": 4.5687965503724034e-05, + "loss": 0.5555, + "step": 505 + }, + { + "epoch": 0.5354497354497354, + "grad_norm": 0.3771184252890316, + "learning_rate": 4.5668365346922776e-05, + "loss": 0.5982, + "step": 506 + }, + { + "epoch": 0.5365079365079365, + "grad_norm": 0.47820059656385455, + "learning_rate": 4.5648765190121526e-05, + "loss": 0.633, + "step": 507 + }, + { + "epoch": 0.5375661375661376, + "grad_norm": 0.31449523301220705, + "learning_rate": 4.562916503332027e-05, + "loss": 0.5999, + "step": 508 + }, + { + "epoch": 0.5386243386243387, + "grad_norm": 5.245441857198801, + "learning_rate": 4.560956487651901e-05, + "loss": 0.6226, + "step": 509 + }, + { + "epoch": 0.5396825396825397, + "grad_norm": 0.64012171457468, + "learning_rate": 4.558996471971776e-05, + "loss": 0.6185, + "step": 510 + }, + { + "epoch": 0.5407407407407407, + "grad_norm": 0.354568866388443, + "learning_rate": 4.557036456291651e-05, + "loss": 0.6311, + "step": 511 + }, + { + "epoch": 0.5417989417989418, + "grad_norm": 1.9487919605291462, + "learning_rate": 4.555076440611525e-05, + "loss": 0.5987, + "step": 512 + }, + { + "epoch": 0.5428571428571428, + "grad_norm": 0.6933984738129275, + "learning_rate": 4.5531164249313994e-05, + "loss": 0.6377, + "step": 513 + }, + { + "epoch": 0.5439153439153439, + "grad_norm": 0.4755759149794088, + "learning_rate": 4.551156409251274e-05, + "loss": 0.5982, + "step": 514 + }, + { + "epoch": 0.544973544973545, + "grad_norm": 0.650297043746455, + "learning_rate": 4.549196393571149e-05, + "loss": 0.5998, + "step": 515 + }, + { + "epoch": 0.546031746031746, + "grad_norm": 0.5279264918555835, + "learning_rate": 4.5472363778910235e-05, + "loss": 0.5869, + "step": 516 + }, + { + "epoch": 0.5470899470899471, + "grad_norm": 0.6614038657390391, + "learning_rate": 4.545276362210898e-05, + "loss": 0.5718, + "step": 517 + }, + { + "epoch": 0.5481481481481482, + "grad_norm": 0.4181458689972182, + "learning_rate": 4.543316346530773e-05, + "loss": 0.5577, + "step": 518 + }, + { + "epoch": 0.5492063492063493, + "grad_norm": 4.552268896504232, + "learning_rate": 4.541356330850647e-05, + "loss": 0.7083, + "step": 519 + }, + { + "epoch": 0.5502645502645502, + "grad_norm": 33.061004547207546, + "learning_rate": 4.539396315170522e-05, + "loss": 0.8391, + "step": 520 + }, + { + "epoch": 0.5513227513227513, + "grad_norm": 1.2664930712290081, + "learning_rate": 4.537436299490396e-05, + "loss": 0.5705, + "step": 521 + }, + { + "epoch": 0.5523809523809524, + "grad_norm": 0.42354787993108955, + "learning_rate": 4.5354762838102704e-05, + "loss": 0.5052, + "step": 522 + }, + { + "epoch": 0.5534391534391534, + "grad_norm": 0.8388212731855088, + "learning_rate": 4.533516268130145e-05, + "loss": 0.5349, + "step": 523 + }, + { + "epoch": 0.5544973544973545, + "grad_norm": 0.7949298334624874, + "learning_rate": 4.53155625245002e-05, + "loss": 0.5497, + "step": 524 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.44871889970887846, + "learning_rate": 4.5295962367698945e-05, + "loss": 0.5978, + "step": 525 + }, + { + "epoch": 0.5566137566137566, + "grad_norm": 0.7676280541833576, + "learning_rate": 4.527636221089769e-05, + "loss": 0.551, + "step": 526 + }, + { + "epoch": 0.5576719576719577, + "grad_norm": 0.7002659965437515, + "learning_rate": 4.5256762054096436e-05, + "loss": 0.5808, + "step": 527 + }, + { + "epoch": 0.5587301587301587, + "grad_norm": 1.2074663454141952, + "learning_rate": 4.5237161897295186e-05, + "loss": 0.6605, + "step": 528 + }, + { + "epoch": 0.5597883597883598, + "grad_norm": 0.712483552007316, + "learning_rate": 4.521756174049392e-05, + "loss": 0.618, + "step": 529 + }, + { + "epoch": 0.5608465608465608, + "grad_norm": 0.331920911665764, + "learning_rate": 4.519796158369267e-05, + "loss": 0.5375, + "step": 530 + }, + { + "epoch": 0.5619047619047619, + "grad_norm": 0.598217747836947, + "learning_rate": 4.517836142689142e-05, + "loss": 0.5801, + "step": 531 + }, + { + "epoch": 0.562962962962963, + "grad_norm": 0.39707437203429746, + "learning_rate": 4.515876127009016e-05, + "loss": 0.5298, + "step": 532 + }, + { + "epoch": 0.564021164021164, + "grad_norm": 0.33682693857579143, + "learning_rate": 4.5139161113288905e-05, + "loss": 0.5137, + "step": 533 + }, + { + "epoch": 0.5650793650793651, + "grad_norm": 0.48164899152667084, + "learning_rate": 4.5119560956487654e-05, + "loss": 0.5781, + "step": 534 + }, + { + "epoch": 0.5661375661375662, + "grad_norm": 0.3219468021100436, + "learning_rate": 4.5099960799686397e-05, + "loss": 0.4857, + "step": 535 + }, + { + "epoch": 0.5671957671957671, + "grad_norm": 0.3320548497953154, + "learning_rate": 4.5080360642885146e-05, + "loss": 0.5725, + "step": 536 + }, + { + "epoch": 0.5682539682539682, + "grad_norm": 0.4483913400362615, + "learning_rate": 4.5060760486083895e-05, + "loss": 0.4854, + "step": 537 + }, + { + "epoch": 0.5693121693121693, + "grad_norm": 0.36430948857681866, + "learning_rate": 4.504116032928264e-05, + "loss": 0.662, + "step": 538 + }, + { + "epoch": 0.5703703703703704, + "grad_norm": 0.342825639833936, + "learning_rate": 4.502156017248138e-05, + "loss": 0.5921, + "step": 539 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.3734526413757713, + "learning_rate": 4.500196001568013e-05, + "loss": 0.5297, + "step": 540 + }, + { + "epoch": 0.5724867724867725, + "grad_norm": 0.3388581653822819, + "learning_rate": 4.498235985887888e-05, + "loss": 0.6147, + "step": 541 + }, + { + "epoch": 0.5735449735449736, + "grad_norm": 0.3205452586395781, + "learning_rate": 4.4962759702077614e-05, + "loss": 0.5175, + "step": 542 + }, + { + "epoch": 0.5746031746031746, + "grad_norm": 0.34747287406127814, + "learning_rate": 4.4943159545276363e-05, + "loss": 0.588, + "step": 543 + }, + { + "epoch": 0.5756613756613757, + "grad_norm": 0.3293870457786716, + "learning_rate": 4.492355938847511e-05, + "loss": 0.6207, + "step": 544 + }, + { + "epoch": 0.5767195767195767, + "grad_norm": 1.5911037019280505, + "learning_rate": 4.4903959231673855e-05, + "loss": 0.6216, + "step": 545 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 0.2970675826508983, + "learning_rate": 4.48843590748726e-05, + "loss": 0.543, + "step": 546 + }, + { + "epoch": 0.5788359788359788, + "grad_norm": 0.3347900655603031, + "learning_rate": 4.486475891807135e-05, + "loss": 0.5524, + "step": 547 + }, + { + "epoch": 0.5798941798941799, + "grad_norm": 0.3297134029351915, + "learning_rate": 4.484515876127009e-05, + "loss": 0.5289, + "step": 548 + }, + { + "epoch": 0.580952380952381, + "grad_norm": 0.33203980488440166, + "learning_rate": 4.482555860446884e-05, + "loss": 0.5628, + "step": 549 + }, + { + "epoch": 0.582010582010582, + "grad_norm": 0.3675901822117606, + "learning_rate": 4.480595844766758e-05, + "loss": 0.6431, + "step": 550 + }, + { + "epoch": 0.5830687830687831, + "grad_norm": 8.35298256158783, + "learning_rate": 4.478635829086633e-05, + "loss": 0.6803, + "step": 551 + }, + { + "epoch": 0.5841269841269842, + "grad_norm": 0.38944467670316385, + "learning_rate": 4.476675813406507e-05, + "loss": 0.6053, + "step": 552 + }, + { + "epoch": 0.5851851851851851, + "grad_norm": 1.8390547635610588, + "learning_rate": 4.474715797726382e-05, + "loss": 0.5739, + "step": 553 + }, + { + "epoch": 0.5862433862433862, + "grad_norm": 0.3840415785962572, + "learning_rate": 4.4727557820462565e-05, + "loss": 0.6275, + "step": 554 + }, + { + "epoch": 0.5873015873015873, + "grad_norm": 0.29418089397003, + "learning_rate": 4.470795766366131e-05, + "loss": 0.535, + "step": 555 + }, + { + "epoch": 0.5883597883597883, + "grad_norm": 0.3736504054747309, + "learning_rate": 4.4688357506860056e-05, + "loss": 0.5943, + "step": 556 + }, + { + "epoch": 0.5894179894179894, + "grad_norm": 0.33285236147227376, + "learning_rate": 4.4668757350058806e-05, + "loss": 0.5586, + "step": 557 + }, + { + "epoch": 0.5904761904761905, + "grad_norm": 0.2903484444203814, + "learning_rate": 4.464915719325755e-05, + "loss": 0.5365, + "step": 558 + }, + { + "epoch": 0.5915343915343916, + "grad_norm": 5.46404606246135, + "learning_rate": 4.462955703645629e-05, + "loss": 0.6439, + "step": 559 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.4599968468845947, + "learning_rate": 4.460995687965504e-05, + "loss": 0.6976, + "step": 560 + }, + { + "epoch": 0.5936507936507937, + "grad_norm": 0.2982395341967786, + "learning_rate": 4.459035672285379e-05, + "loss": 0.5041, + "step": 561 + }, + { + "epoch": 0.5947089947089947, + "grad_norm": 0.30282795685306324, + "learning_rate": 4.457075656605253e-05, + "loss": 0.5909, + "step": 562 + }, + { + "epoch": 0.5957671957671957, + "grad_norm": 0.39972890415638074, + "learning_rate": 4.4551156409251274e-05, + "loss": 0.6235, + "step": 563 + }, + { + "epoch": 0.5968253968253968, + "grad_norm": 0.2873678600174425, + "learning_rate": 4.4531556252450023e-05, + "loss": 0.5421, + "step": 564 + }, + { + "epoch": 0.5978835978835979, + "grad_norm": 0.3559983769537934, + "learning_rate": 4.4511956095648766e-05, + "loss": 0.5561, + "step": 565 + }, + { + "epoch": 0.5989417989417989, + "grad_norm": 0.3147975278256888, + "learning_rate": 4.4492355938847515e-05, + "loss": 0.4711, + "step": 566 + }, + { + "epoch": 0.6, + "grad_norm": 0.2607641173456458, + "learning_rate": 4.447275578204626e-05, + "loss": 0.5823, + "step": 567 + }, + { + "epoch": 0.6010582010582011, + "grad_norm": 0.2926523078468275, + "learning_rate": 4.4453155625245e-05, + "loss": 0.5406, + "step": 568 + }, + { + "epoch": 0.6021164021164022, + "grad_norm": 0.3452463175268165, + "learning_rate": 4.443355546844375e-05, + "loss": 0.5894, + "step": 569 + }, + { + "epoch": 0.6031746031746031, + "grad_norm": 0.24630431433470665, + "learning_rate": 4.44139553116425e-05, + "loss": 0.517, + "step": 570 + }, + { + "epoch": 0.6042328042328042, + "grad_norm": 0.3401893647581997, + "learning_rate": 4.439435515484124e-05, + "loss": 0.5647, + "step": 571 + }, + { + "epoch": 0.6052910052910053, + "grad_norm": 0.3208322781574765, + "learning_rate": 4.4374754998039984e-05, + "loss": 0.5116, + "step": 572 + }, + { + "epoch": 0.6063492063492063, + "grad_norm": 0.28383141803835954, + "learning_rate": 4.435515484123873e-05, + "loss": 0.5441, + "step": 573 + }, + { + "epoch": 0.6074074074074074, + "grad_norm": 0.34262017652008814, + "learning_rate": 4.433555468443748e-05, + "loss": 0.5972, + "step": 574 + }, + { + "epoch": 0.6084656084656085, + "grad_norm": 0.31557046384544746, + "learning_rate": 4.4315954527636225e-05, + "loss": 0.6478, + "step": 575 + }, + { + "epoch": 0.6095238095238096, + "grad_norm": 0.31340622501168497, + "learning_rate": 4.429635437083497e-05, + "loss": 0.561, + "step": 576 + }, + { + "epoch": 0.6105820105820106, + "grad_norm": 0.27478250517189123, + "learning_rate": 4.4276754214033716e-05, + "loss": 0.6372, + "step": 577 + }, + { + "epoch": 0.6116402116402117, + "grad_norm": 0.3096835013791637, + "learning_rate": 4.425715405723246e-05, + "loss": 0.5873, + "step": 578 + }, + { + "epoch": 0.6126984126984127, + "grad_norm": 0.29052274812803913, + "learning_rate": 4.423755390043121e-05, + "loss": 0.6673, + "step": 579 + }, + { + "epoch": 0.6137566137566137, + "grad_norm": 0.2726481497564029, + "learning_rate": 4.421795374362995e-05, + "loss": 0.5688, + "step": 580 + }, + { + "epoch": 0.6148148148148148, + "grad_norm": 0.30868513597771813, + "learning_rate": 4.419835358682869e-05, + "loss": 0.5413, + "step": 581 + }, + { + "epoch": 0.6158730158730159, + "grad_norm": 0.2931107306494861, + "learning_rate": 4.417875343002744e-05, + "loss": 0.6191, + "step": 582 + }, + { + "epoch": 0.6169312169312169, + "grad_norm": 0.27359852790549766, + "learning_rate": 4.415915327322619e-05, + "loss": 0.5976, + "step": 583 + }, + { + "epoch": 0.617989417989418, + "grad_norm": 0.300451446217763, + "learning_rate": 4.4139553116424934e-05, + "loss": 0.5958, + "step": 584 + }, + { + "epoch": 0.6190476190476191, + "grad_norm": 0.2739263908226725, + "learning_rate": 4.411995295962368e-05, + "loss": 0.4819, + "step": 585 + }, + { + "epoch": 0.6201058201058202, + "grad_norm": 0.3629370235736209, + "learning_rate": 4.4100352802822426e-05, + "loss": 0.5969, + "step": 586 + }, + { + "epoch": 0.6211640211640211, + "grad_norm": 0.2546483909460872, + "learning_rate": 4.4080752646021175e-05, + "loss": 0.5637, + "step": 587 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.2900274901311552, + "learning_rate": 4.406115248921992e-05, + "loss": 0.5981, + "step": 588 + }, + { + "epoch": 0.6232804232804233, + "grad_norm": 0.30997705577212653, + "learning_rate": 4.404155233241866e-05, + "loss": 0.5763, + "step": 589 + }, + { + "epoch": 0.6243386243386243, + "grad_norm": 0.27235932084700465, + "learning_rate": 4.402195217561741e-05, + "loss": 0.6027, + "step": 590 + }, + { + "epoch": 0.6253968253968254, + "grad_norm": 0.27424029005807476, + "learning_rate": 4.400235201881615e-05, + "loss": 0.528, + "step": 591 + }, + { + "epoch": 0.6264550264550265, + "grad_norm": 0.8024320046005814, + "learning_rate": 4.39827518620149e-05, + "loss": 0.5073, + "step": 592 + }, + { + "epoch": 0.6275132275132275, + "grad_norm": 0.25330323543588595, + "learning_rate": 4.3963151705213644e-05, + "loss": 0.5903, + "step": 593 + }, + { + "epoch": 0.6285714285714286, + "grad_norm": 0.2786349566048451, + "learning_rate": 4.3943551548412386e-05, + "loss": 0.5646, + "step": 594 + }, + { + "epoch": 0.6296296296296297, + "grad_norm": 0.27111077124213456, + "learning_rate": 4.3923951391611135e-05, + "loss": 0.5166, + "step": 595 + }, + { + "epoch": 0.6306878306878307, + "grad_norm": 0.2537581235144252, + "learning_rate": 4.3904351234809885e-05, + "loss": 0.5475, + "step": 596 + }, + { + "epoch": 0.6317460317460317, + "grad_norm": 0.2940667859193826, + "learning_rate": 4.388475107800862e-05, + "loss": 0.5437, + "step": 597 + }, + { + "epoch": 0.6328042328042328, + "grad_norm": 0.26691327126042796, + "learning_rate": 4.386515092120737e-05, + "loss": 0.5508, + "step": 598 + }, + { + "epoch": 0.6338624338624339, + "grad_norm": 3.125987633190478, + "learning_rate": 4.384555076440612e-05, + "loss": 0.5517, + "step": 599 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.3051459051560689, + "learning_rate": 4.382595060760487e-05, + "loss": 0.5032, + "step": 600 + }, + { + "epoch": 0.635978835978836, + "grad_norm": 0.2820838021225074, + "learning_rate": 4.3806350450803604e-05, + "loss": 0.5407, + "step": 601 + }, + { + "epoch": 0.6370370370370371, + "grad_norm": 1.3393856684804768, + "learning_rate": 4.378675029400235e-05, + "loss": 0.6108, + "step": 602 + }, + { + "epoch": 0.638095238095238, + "grad_norm": 0.3228594095981327, + "learning_rate": 4.37671501372011e-05, + "loss": 0.5025, + "step": 603 + }, + { + "epoch": 0.6391534391534391, + "grad_norm": 0.3351209221453693, + "learning_rate": 4.3747549980399845e-05, + "loss": 0.5659, + "step": 604 + }, + { + "epoch": 0.6402116402116402, + "grad_norm": 0.31230131092186864, + "learning_rate": 4.372794982359859e-05, + "loss": 0.6007, + "step": 605 + }, + { + "epoch": 0.6412698412698413, + "grad_norm": 0.2828867288582573, + "learning_rate": 4.370834966679734e-05, + "loss": 0.527, + "step": 606 + }, + { + "epoch": 0.6423280423280423, + "grad_norm": 0.34529541782733586, + "learning_rate": 4.368874950999608e-05, + "loss": 0.5486, + "step": 607 + }, + { + "epoch": 0.6433862433862434, + "grad_norm": 0.2951068794541986, + "learning_rate": 4.366914935319483e-05, + "loss": 0.5267, + "step": 608 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 0.40815008679176695, + "learning_rate": 4.364954919639358e-05, + "loss": 0.5177, + "step": 609 + }, + { + "epoch": 0.6455026455026455, + "grad_norm": 0.32045696467121443, + "learning_rate": 4.362994903959231e-05, + "loss": 0.5586, + "step": 610 + }, + { + "epoch": 0.6465608465608466, + "grad_norm": 0.32938732342590915, + "learning_rate": 4.361034888279106e-05, + "loss": 0.5711, + "step": 611 + }, + { + "epoch": 0.6476190476190476, + "grad_norm": 0.2914985462165065, + "learning_rate": 4.359074872598981e-05, + "loss": 0.6356, + "step": 612 + }, + { + "epoch": 0.6486772486772486, + "grad_norm": 0.31957339334532875, + "learning_rate": 4.357114856918856e-05, + "loss": 0.5775, + "step": 613 + }, + { + "epoch": 0.6497354497354497, + "grad_norm": 0.3380048318234734, + "learning_rate": 4.35515484123873e-05, + "loss": 0.5668, + "step": 614 + }, + { + "epoch": 0.6507936507936508, + "grad_norm": 0.2677052222743914, + "learning_rate": 4.3531948255586046e-05, + "loss": 0.5867, + "step": 615 + }, + { + "epoch": 0.6518518518518519, + "grad_norm": 0.2714050773104208, + "learning_rate": 4.3512348098784795e-05, + "loss": 0.569, + "step": 616 + }, + { + "epoch": 0.6529100529100529, + "grad_norm": 0.322108580723527, + "learning_rate": 4.349274794198354e-05, + "loss": 0.5379, + "step": 617 + }, + { + "epoch": 0.653968253968254, + "grad_norm": 0.2569245838676028, + "learning_rate": 4.347314778518228e-05, + "loss": 0.5118, + "step": 618 + }, + { + "epoch": 0.6550264550264551, + "grad_norm": 0.2530435498061503, + "learning_rate": 4.345354762838103e-05, + "loss": 0.5139, + "step": 619 + }, + { + "epoch": 0.656084656084656, + "grad_norm": 0.29585162017581107, + "learning_rate": 4.343394747157977e-05, + "loss": 0.4954, + "step": 620 + }, + { + "epoch": 0.6571428571428571, + "grad_norm": 0.2924074793370025, + "learning_rate": 4.341434731477852e-05, + "loss": 0.6042, + "step": 621 + }, + { + "epoch": 0.6582010582010582, + "grad_norm": 0.2855302831026707, + "learning_rate": 4.3394747157977264e-05, + "loss": 0.514, + "step": 622 + }, + { + "epoch": 0.6592592592592592, + "grad_norm": 0.7366169291367959, + "learning_rate": 4.337514700117601e-05, + "loss": 0.4895, + "step": 623 + }, + { + "epoch": 0.6603174603174603, + "grad_norm": 0.27063474014038635, + "learning_rate": 4.3355546844374756e-05, + "loss": 0.5592, + "step": 624 + }, + { + "epoch": 0.6613756613756614, + "grad_norm": 0.2539651508587892, + "learning_rate": 4.3335946687573505e-05, + "loss": 0.5301, + "step": 625 + }, + { + "epoch": 0.6624338624338625, + "grad_norm": 0.2458381741039525, + "learning_rate": 4.331634653077225e-05, + "loss": 0.5762, + "step": 626 + }, + { + "epoch": 0.6634920634920635, + "grad_norm": 0.25275919533490976, + "learning_rate": 4.329674637397099e-05, + "loss": 0.5422, + "step": 627 + }, + { + "epoch": 0.6645502645502646, + "grad_norm": 0.2626626352002252, + "learning_rate": 4.327714621716974e-05, + "loss": 0.5175, + "step": 628 + }, + { + "epoch": 0.6656084656084656, + "grad_norm": 0.2646967802621259, + "learning_rate": 4.325754606036849e-05, + "loss": 0.4619, + "step": 629 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.24368769535748608, + "learning_rate": 4.323794590356723e-05, + "loss": 0.5964, + "step": 630 + }, + { + "epoch": 0.6677248677248677, + "grad_norm": 0.2959718652590851, + "learning_rate": 4.321834574676597e-05, + "loss": 0.5691, + "step": 631 + }, + { + "epoch": 0.6687830687830688, + "grad_norm": 0.2871293046686386, + "learning_rate": 4.319874558996472e-05, + "loss": 0.5696, + "step": 632 + }, + { + "epoch": 0.6698412698412698, + "grad_norm": 0.2905598264986975, + "learning_rate": 4.3179145433163465e-05, + "loss": 0.629, + "step": 633 + }, + { + "epoch": 0.6708994708994709, + "grad_norm": 0.2528097775183016, + "learning_rate": 4.3159545276362214e-05, + "loss": 0.4849, + "step": 634 + }, + { + "epoch": 0.671957671957672, + "grad_norm": 0.33621938773228494, + "learning_rate": 4.313994511956096e-05, + "loss": 0.5879, + "step": 635 + }, + { + "epoch": 0.6730158730158731, + "grad_norm": 0.2394966771033253, + "learning_rate": 4.3120344962759706e-05, + "loss": 0.439, + "step": 636 + }, + { + "epoch": 0.674074074074074, + "grad_norm": 0.3150605848513219, + "learning_rate": 4.310074480595845e-05, + "loss": 0.5989, + "step": 637 + }, + { + "epoch": 0.6751322751322751, + "grad_norm": 0.2981707132882546, + "learning_rate": 4.30811446491572e-05, + "loss": 0.6225, + "step": 638 + }, + { + "epoch": 0.6761904761904762, + "grad_norm": 0.30492112978736213, + "learning_rate": 4.306154449235594e-05, + "loss": 0.5404, + "step": 639 + }, + { + "epoch": 0.6772486772486772, + "grad_norm": 0.2814127526303401, + "learning_rate": 4.304194433555468e-05, + "loss": 0.5233, + "step": 640 + }, + { + "epoch": 0.6783068783068783, + "grad_norm": 0.30173716166671577, + "learning_rate": 4.302234417875343e-05, + "loss": 0.6427, + "step": 641 + }, + { + "epoch": 0.6793650793650794, + "grad_norm": 0.25137410162980395, + "learning_rate": 4.300274402195218e-05, + "loss": 0.5607, + "step": 642 + }, + { + "epoch": 0.6804232804232804, + "grad_norm": 0.30267125046888244, + "learning_rate": 4.2983143865150924e-05, + "loss": 0.6308, + "step": 643 + }, + { + "epoch": 0.6814814814814815, + "grad_norm": 0.2570686438542522, + "learning_rate": 4.2963543708349666e-05, + "loss": 0.5708, + "step": 644 + }, + { + "epoch": 0.6825396825396826, + "grad_norm": 0.47196520519510343, + "learning_rate": 4.2943943551548416e-05, + "loss": 0.4772, + "step": 645 + }, + { + "epoch": 0.6835978835978836, + "grad_norm": 0.2733048973472466, + "learning_rate": 4.292434339474716e-05, + "loss": 0.5569, + "step": 646 + }, + { + "epoch": 0.6846560846560846, + "grad_norm": 0.29886485226274184, + "learning_rate": 4.290474323794591e-05, + "loss": 0.5785, + "step": 647 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 0.2403402937990743, + "learning_rate": 4.288514308114465e-05, + "loss": 0.5595, + "step": 648 + }, + { + "epoch": 0.6867724867724868, + "grad_norm": 0.2794008545700776, + "learning_rate": 4.28655429243434e-05, + "loss": 0.5396, + "step": 649 + }, + { + "epoch": 0.6878306878306878, + "grad_norm": 0.3041316437038585, + "learning_rate": 4.284594276754214e-05, + "loss": 0.5601, + "step": 650 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 0.24855481366370547, + "learning_rate": 4.282634261074089e-05, + "loss": 0.5405, + "step": 651 + }, + { + "epoch": 0.68994708994709, + "grad_norm": 0.3040097458165702, + "learning_rate": 4.280674245393963e-05, + "loss": 0.64, + "step": 652 + }, + { + "epoch": 0.691005291005291, + "grad_norm": 0.3069869635860471, + "learning_rate": 4.2787142297138376e-05, + "loss": 0.4583, + "step": 653 + }, + { + "epoch": 0.692063492063492, + "grad_norm": 0.2936008162342032, + "learning_rate": 4.2767542140337125e-05, + "loss": 0.5027, + "step": 654 + }, + { + "epoch": 0.6931216931216931, + "grad_norm": 0.25492661711077735, + "learning_rate": 4.2747941983535874e-05, + "loss": 0.5225, + "step": 655 + }, + { + "epoch": 0.6941798941798942, + "grad_norm": 2.5144220055291275, + "learning_rate": 4.272834182673462e-05, + "loss": 0.6846, + "step": 656 + }, + { + "epoch": 0.6952380952380952, + "grad_norm": 0.28990557153471347, + "learning_rate": 4.270874166993336e-05, + "loss": 0.5752, + "step": 657 + }, + { + "epoch": 0.6962962962962963, + "grad_norm": 0.26542534568407355, + "learning_rate": 4.268914151313211e-05, + "loss": 0.4701, + "step": 658 + }, + { + "epoch": 0.6973544973544974, + "grad_norm": 0.3193855535773065, + "learning_rate": 4.266954135633085e-05, + "loss": 0.4586, + "step": 659 + }, + { + "epoch": 0.6984126984126984, + "grad_norm": 0.27477559170920174, + "learning_rate": 4.26499411995296e-05, + "loss": 0.5044, + "step": 660 + }, + { + "epoch": 0.6994708994708995, + "grad_norm": 0.2691283100685374, + "learning_rate": 4.263034104272834e-05, + "loss": 0.6087, + "step": 661 + }, + { + "epoch": 0.7005291005291006, + "grad_norm": 0.3830270820991914, + "learning_rate": 4.261074088592709e-05, + "loss": 0.5775, + "step": 662 + }, + { + "epoch": 0.7015873015873015, + "grad_norm": 0.2752894226111952, + "learning_rate": 4.2591140729125835e-05, + "loss": 0.5079, + "step": 663 + }, + { + "epoch": 0.7026455026455026, + "grad_norm": 0.28397687271151834, + "learning_rate": 4.2571540572324584e-05, + "loss": 0.5804, + "step": 664 + }, + { + "epoch": 0.7037037037037037, + "grad_norm": 3.367695352363058, + "learning_rate": 4.2551940415523326e-05, + "loss": 0.7196, + "step": 665 + }, + { + "epoch": 0.7047619047619048, + "grad_norm": 0.38941063877467563, + "learning_rate": 4.253234025872207e-05, + "loss": 0.5676, + "step": 666 + }, + { + "epoch": 0.7058201058201058, + "grad_norm": 0.299525285575676, + "learning_rate": 4.251274010192082e-05, + "loss": 0.5809, + "step": 667 + }, + { + "epoch": 0.7068783068783069, + "grad_norm": 0.24410082905969419, + "learning_rate": 4.249313994511957e-05, + "loss": 0.5784, + "step": 668 + }, + { + "epoch": 0.707936507936508, + "grad_norm": 0.817819426083752, + "learning_rate": 4.24735397883183e-05, + "loss": 0.5015, + "step": 669 + }, + { + "epoch": 0.708994708994709, + "grad_norm": 0.35760232367472883, + "learning_rate": 4.245393963151705e-05, + "loss": 0.6156, + "step": 670 + }, + { + "epoch": 0.71005291005291, + "grad_norm": 0.2827406205547459, + "learning_rate": 4.24343394747158e-05, + "loss": 0.5757, + "step": 671 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.28008993326048687, + "learning_rate": 4.241473931791455e-05, + "loss": 0.5112, + "step": 672 + }, + { + "epoch": 0.7121693121693121, + "grad_norm": 0.2924001411666532, + "learning_rate": 4.2395139161113287e-05, + "loss": 0.5523, + "step": 673 + }, + { + "epoch": 0.7132275132275132, + "grad_norm": 0.27305443841057714, + "learning_rate": 4.2375539004312036e-05, + "loss": 0.5126, + "step": 674 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.2736723087303262, + "learning_rate": 4.2355938847510785e-05, + "loss": 0.5388, + "step": 675 + }, + { + "epoch": 0.7153439153439154, + "grad_norm": 0.27719351183152063, + "learning_rate": 4.233633869070953e-05, + "loss": 0.5331, + "step": 676 + }, + { + "epoch": 0.7164021164021164, + "grad_norm": 0.29024073675556417, + "learning_rate": 4.231673853390827e-05, + "loss": 0.5324, + "step": 677 + }, + { + "epoch": 0.7174603174603175, + "grad_norm": 0.28459105874291735, + "learning_rate": 4.229713837710702e-05, + "loss": 0.5723, + "step": 678 + }, + { + "epoch": 0.7185185185185186, + "grad_norm": 0.27008204854468826, + "learning_rate": 4.227753822030576e-05, + "loss": 0.5183, + "step": 679 + }, + { + "epoch": 0.7195767195767195, + "grad_norm": 0.32531200458934517, + "learning_rate": 4.225793806350451e-05, + "loss": 0.5417, + "step": 680 + }, + { + "epoch": 0.7206349206349206, + "grad_norm": 0.2827590448071415, + "learning_rate": 4.223833790670326e-05, + "loss": 0.5688, + "step": 681 + }, + { + "epoch": 0.7216931216931217, + "grad_norm": 0.32843041348454105, + "learning_rate": 4.2218737749901996e-05, + "loss": 0.4738, + "step": 682 + }, + { + "epoch": 0.7227513227513227, + "grad_norm": 0.29203603535869277, + "learning_rate": 4.2199137593100745e-05, + "loss": 0.571, + "step": 683 + }, + { + "epoch": 0.7238095238095238, + "grad_norm": 0.26235624444883343, + "learning_rate": 4.2179537436299495e-05, + "loss": 0.5339, + "step": 684 + }, + { + "epoch": 0.7248677248677249, + "grad_norm": 0.2743770991741004, + "learning_rate": 4.2159937279498244e-05, + "loss": 0.5938, + "step": 685 + }, + { + "epoch": 0.725925925925926, + "grad_norm": 0.28706543540832447, + "learning_rate": 4.214033712269698e-05, + "loss": 0.5088, + "step": 686 + }, + { + "epoch": 0.726984126984127, + "grad_norm": 0.2831332896571073, + "learning_rate": 4.212073696589573e-05, + "loss": 0.606, + "step": 687 + }, + { + "epoch": 0.728042328042328, + "grad_norm": 0.262186187511753, + "learning_rate": 4.210113680909448e-05, + "loss": 0.5341, + "step": 688 + }, + { + "epoch": 0.7291005291005291, + "grad_norm": 0.3096610015904226, + "learning_rate": 4.208153665229322e-05, + "loss": 0.5336, + "step": 689 + }, + { + "epoch": 0.7301587301587301, + "grad_norm": 0.2733332314767257, + "learning_rate": 4.206193649549196e-05, + "loss": 0.5145, + "step": 690 + }, + { + "epoch": 0.7312169312169312, + "grad_norm": 0.2438987308317826, + "learning_rate": 4.204233633869071e-05, + "loss": 0.5574, + "step": 691 + }, + { + "epoch": 0.7322751322751323, + "grad_norm": 0.29975311338331406, + "learning_rate": 4.2022736181889455e-05, + "loss": 0.5676, + "step": 692 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.3159755996646559, + "learning_rate": 4.2003136025088204e-05, + "loss": 0.6154, + "step": 693 + }, + { + "epoch": 0.7343915343915344, + "grad_norm": 0.24416951903546183, + "learning_rate": 4.1983535868286947e-05, + "loss": 0.5248, + "step": 694 + }, + { + "epoch": 0.7354497354497355, + "grad_norm": 0.24947362261992206, + "learning_rate": 4.196393571148569e-05, + "loss": 0.5121, + "step": 695 + }, + { + "epoch": 0.7365079365079366, + "grad_norm": 0.30381658421133123, + "learning_rate": 4.194433555468444e-05, + "loss": 0.583, + "step": 696 + }, + { + "epoch": 0.7375661375661375, + "grad_norm": 0.23096151371713622, + "learning_rate": 4.192473539788319e-05, + "loss": 0.5245, + "step": 697 + }, + { + "epoch": 0.7386243386243386, + "grad_norm": 0.3162192673911441, + "learning_rate": 4.190513524108193e-05, + "loss": 0.5468, + "step": 698 + }, + { + "epoch": 0.7396825396825397, + "grad_norm": 0.25182522028970394, + "learning_rate": 4.188553508428067e-05, + "loss": 0.597, + "step": 699 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.2684334525549805, + "learning_rate": 4.186593492747942e-05, + "loss": 0.576, + "step": 700 + }, + { + "epoch": 0.7417989417989418, + "grad_norm": 1.523561308125903, + "learning_rate": 4.184633477067817e-05, + "loss": 0.5329, + "step": 701 + }, + { + "epoch": 0.7428571428571429, + "grad_norm": 0.28565441595265484, + "learning_rate": 4.1826734613876914e-05, + "loss": 0.5196, + "step": 702 + }, + { + "epoch": 0.7439153439153439, + "grad_norm": 0.2888862478367132, + "learning_rate": 4.1807134457075656e-05, + "loss": 0.6255, + "step": 703 + }, + { + "epoch": 0.744973544973545, + "grad_norm": 0.3007510062268032, + "learning_rate": 4.1787534300274405e-05, + "loss": 0.5565, + "step": 704 + }, + { + "epoch": 0.746031746031746, + "grad_norm": 0.27427995824764073, + "learning_rate": 4.176793414347315e-05, + "loss": 0.5533, + "step": 705 + }, + { + "epoch": 0.7470899470899471, + "grad_norm": 0.24178542578986884, + "learning_rate": 4.17483339866719e-05, + "loss": 0.4839, + "step": 706 + }, + { + "epoch": 0.7481481481481481, + "grad_norm": 0.3447762627838107, + "learning_rate": 4.172873382987064e-05, + "loss": 0.5418, + "step": 707 + }, + { + "epoch": 0.7492063492063492, + "grad_norm": 0.28512746124111127, + "learning_rate": 4.170913367306938e-05, + "loss": 0.5593, + "step": 708 + }, + { + "epoch": 0.7502645502645503, + "grad_norm": 0.25654137696146834, + "learning_rate": 4.168953351626813e-05, + "loss": 0.5308, + "step": 709 + }, + { + "epoch": 0.7513227513227513, + "grad_norm": 0.3162447256848906, + "learning_rate": 4.166993335946688e-05, + "loss": 0.5092, + "step": 710 + }, + { + "epoch": 0.7523809523809524, + "grad_norm": 0.2706709395661285, + "learning_rate": 4.165033320266562e-05, + "loss": 0.549, + "step": 711 + }, + { + "epoch": 0.7534391534391535, + "grad_norm": 0.27951783492610943, + "learning_rate": 4.1630733045864365e-05, + "loss": 0.5039, + "step": 712 + }, + { + "epoch": 0.7544973544973544, + "grad_norm": 0.34542932566254875, + "learning_rate": 4.1611132889063115e-05, + "loss": 0.5269, + "step": 713 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 0.2666243377680122, + "learning_rate": 4.1591532732261864e-05, + "loss": 0.5833, + "step": 714 + }, + { + "epoch": 0.7566137566137566, + "grad_norm": 0.3245207820226863, + "learning_rate": 4.1571932575460606e-05, + "loss": 0.5764, + "step": 715 + }, + { + "epoch": 0.7576719576719577, + "grad_norm": 0.26147609233769936, + "learning_rate": 4.155233241865935e-05, + "loss": 0.5527, + "step": 716 + }, + { + "epoch": 0.7587301587301587, + "grad_norm": 0.2584431704263216, + "learning_rate": 4.15327322618581e-05, + "loss": 0.5637, + "step": 717 + }, + { + "epoch": 0.7597883597883598, + "grad_norm": 0.2731600454527648, + "learning_rate": 4.151313210505684e-05, + "loss": 0.6257, + "step": 718 + }, + { + "epoch": 0.7608465608465609, + "grad_norm": 0.2430610169832162, + "learning_rate": 4.149353194825559e-05, + "loss": 0.5118, + "step": 719 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 0.46705609114743374, + "learning_rate": 4.147393179145433e-05, + "loss": 0.5657, + "step": 720 + }, + { + "epoch": 0.762962962962963, + "grad_norm": 0.2765619748725685, + "learning_rate": 4.145433163465308e-05, + "loss": 0.5653, + "step": 721 + }, + { + "epoch": 0.764021164021164, + "grad_norm": 0.2445879143109297, + "learning_rate": 4.1434731477851824e-05, + "loss": 0.5549, + "step": 722 + }, + { + "epoch": 0.765079365079365, + "grad_norm": 0.29444717176229945, + "learning_rate": 4.1415131321050573e-05, + "loss": 0.5159, + "step": 723 + }, + { + "epoch": 0.7661375661375661, + "grad_norm": 0.23188418245940617, + "learning_rate": 4.1395531164249316e-05, + "loss": 0.5596, + "step": 724 + }, + { + "epoch": 0.7671957671957672, + "grad_norm": 0.2726530916672444, + "learning_rate": 4.137593100744806e-05, + "loss": 0.5596, + "step": 725 + }, + { + "epoch": 0.7682539682539683, + "grad_norm": 0.26947956374326953, + "learning_rate": 4.135633085064681e-05, + "loss": 0.5813, + "step": 726 + }, + { + "epoch": 0.7693121693121693, + "grad_norm": 0.23506890162797972, + "learning_rate": 4.133673069384556e-05, + "loss": 0.5437, + "step": 727 + }, + { + "epoch": 0.7703703703703704, + "grad_norm": 2.972237291768401, + "learning_rate": 4.13171305370443e-05, + "loss": 0.5998, + "step": 728 + }, + { + "epoch": 0.7714285714285715, + "grad_norm": 0.27441250036308384, + "learning_rate": 4.129753038024304e-05, + "loss": 0.5271, + "step": 729 + }, + { + "epoch": 0.7724867724867724, + "grad_norm": 0.2522029487623296, + "learning_rate": 4.127793022344179e-05, + "loss": 0.5339, + "step": 730 + }, + { + "epoch": 0.7735449735449735, + "grad_norm": 0.25217726436063065, + "learning_rate": 4.1258330066640534e-05, + "loss": 0.578, + "step": 731 + }, + { + "epoch": 0.7746031746031746, + "grad_norm": 0.25344645901788887, + "learning_rate": 4.123872990983928e-05, + "loss": 0.5418, + "step": 732 + }, + { + "epoch": 0.7756613756613756, + "grad_norm": 0.2396746178638489, + "learning_rate": 4.1219129753038025e-05, + "loss": 0.5244, + "step": 733 + }, + { + "epoch": 0.7767195767195767, + "grad_norm": 0.31881877046038376, + "learning_rate": 4.1199529596236775e-05, + "loss": 0.5785, + "step": 734 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.23060282015738198, + "learning_rate": 4.117992943943552e-05, + "loss": 0.4795, + "step": 735 + }, + { + "epoch": 0.7788359788359789, + "grad_norm": 0.26057461199701626, + "learning_rate": 4.1160329282634266e-05, + "loss": 0.4967, + "step": 736 + }, + { + "epoch": 0.7798941798941799, + "grad_norm": 0.24273599035257443, + "learning_rate": 4.114072912583301e-05, + "loss": 0.5449, + "step": 737 + }, + { + "epoch": 0.780952380952381, + "grad_norm": 0.23318919061727453, + "learning_rate": 4.112112896903175e-05, + "loss": 0.4872, + "step": 738 + }, + { + "epoch": 0.782010582010582, + "grad_norm": 0.28999779048722424, + "learning_rate": 4.11015288122305e-05, + "loss": 0.6129, + "step": 739 + }, + { + "epoch": 0.783068783068783, + "grad_norm": 0.25705614998142, + "learning_rate": 4.108192865542925e-05, + "loss": 0.5755, + "step": 740 + }, + { + "epoch": 0.7841269841269841, + "grad_norm": 0.24516780262995658, + "learning_rate": 4.1062328498627986e-05, + "loss": 0.5313, + "step": 741 + }, + { + "epoch": 0.7851851851851852, + "grad_norm": 0.2632105354730925, + "learning_rate": 4.1042728341826735e-05, + "loss": 0.4978, + "step": 742 + }, + { + "epoch": 0.7862433862433862, + "grad_norm": 0.2950535479592732, + "learning_rate": 4.1023128185025484e-05, + "loss": 0.5477, + "step": 743 + }, + { + "epoch": 0.7873015873015873, + "grad_norm": 0.2890918764180236, + "learning_rate": 4.100352802822423e-05, + "loss": 0.6027, + "step": 744 + }, + { + "epoch": 0.7883597883597884, + "grad_norm": 0.24905516173631076, + "learning_rate": 4.098392787142297e-05, + "loss": 0.4916, + "step": 745 + }, + { + "epoch": 0.7894179894179895, + "grad_norm": 0.30918231275089586, + "learning_rate": 4.096432771462172e-05, + "loss": 0.5521, + "step": 746 + }, + { + "epoch": 0.7904761904761904, + "grad_norm": 0.254678981001304, + "learning_rate": 4.094472755782047e-05, + "loss": 0.5693, + "step": 747 + }, + { + "epoch": 0.7915343915343915, + "grad_norm": 0.24323586920042417, + "learning_rate": 4.092512740101921e-05, + "loss": 0.5497, + "step": 748 + }, + { + "epoch": 0.7925925925925926, + "grad_norm": 0.2888285005504316, + "learning_rate": 4.090552724421795e-05, + "loss": 0.4983, + "step": 749 + }, + { + "epoch": 0.7936507936507936, + "grad_norm": 0.259386381669663, + "learning_rate": 4.08859270874167e-05, + "loss": 0.6195, + "step": 750 + }, + { + "epoch": 0.7947089947089947, + "grad_norm": 0.23036461293567234, + "learning_rate": 4.0866326930615444e-05, + "loss": 0.5492, + "step": 751 + }, + { + "epoch": 0.7957671957671958, + "grad_norm": 0.2678855648948368, + "learning_rate": 4.0846726773814194e-05, + "loss": 0.5072, + "step": 752 + }, + { + "epoch": 0.7968253968253968, + "grad_norm": 0.2887180222360751, + "learning_rate": 4.082712661701294e-05, + "loss": 0.5246, + "step": 753 + }, + { + "epoch": 0.7978835978835979, + "grad_norm": 0.2568292309960633, + "learning_rate": 4.080752646021168e-05, + "loss": 0.5184, + "step": 754 + }, + { + "epoch": 0.798941798941799, + "grad_norm": 0.24711251400823, + "learning_rate": 4.078792630341043e-05, + "loss": 0.5341, + "step": 755 + }, + { + "epoch": 0.8, + "grad_norm": 0.31036770266770836, + "learning_rate": 4.076832614660918e-05, + "loss": 0.6273, + "step": 756 + }, + { + "epoch": 0.801058201058201, + "grad_norm": 0.2571757220271915, + "learning_rate": 4.074872598980792e-05, + "loss": 0.5696, + "step": 757 + }, + { + "epoch": 0.8021164021164021, + "grad_norm": 0.26980125079686357, + "learning_rate": 4.072912583300666e-05, + "loss": 0.5467, + "step": 758 + }, + { + "epoch": 0.8031746031746032, + "grad_norm": 0.24951832038119673, + "learning_rate": 4.070952567620541e-05, + "loss": 0.5083, + "step": 759 + }, + { + "epoch": 0.8042328042328042, + "grad_norm": 0.25244068781335705, + "learning_rate": 4.068992551940416e-05, + "loss": 0.4868, + "step": 760 + }, + { + "epoch": 0.8052910052910053, + "grad_norm": 0.24075958410510967, + "learning_rate": 4.06703253626029e-05, + "loss": 0.5132, + "step": 761 + }, + { + "epoch": 0.8063492063492064, + "grad_norm": 0.299818761325055, + "learning_rate": 4.0650725205801646e-05, + "loss": 0.5085, + "step": 762 + }, + { + "epoch": 0.8074074074074075, + "grad_norm": 0.2556890383489905, + "learning_rate": 4.0631125049000395e-05, + "loss": 0.6116, + "step": 763 + }, + { + "epoch": 0.8084656084656084, + "grad_norm": 0.25781514459024585, + "learning_rate": 4.061152489219914e-05, + "loss": 0.5138, + "step": 764 + }, + { + "epoch": 0.8095238095238095, + "grad_norm": 0.4812654512833488, + "learning_rate": 4.059192473539789e-05, + "loss": 0.5117, + "step": 765 + }, + { + "epoch": 0.8105820105820106, + "grad_norm": 0.43995941221107904, + "learning_rate": 4.057232457859663e-05, + "loss": 0.523, + "step": 766 + }, + { + "epoch": 0.8116402116402116, + "grad_norm": 0.2570086400263589, + "learning_rate": 4.055272442179537e-05, + "loss": 0.4922, + "step": 767 + }, + { + "epoch": 0.8126984126984127, + "grad_norm": 0.27355262696697646, + "learning_rate": 4.053312426499412e-05, + "loss": 0.5214, + "step": 768 + }, + { + "epoch": 0.8137566137566138, + "grad_norm": 0.30929404940417793, + "learning_rate": 4.051352410819287e-05, + "loss": 0.5595, + "step": 769 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 0.25651422442360594, + "learning_rate": 4.049392395139161e-05, + "loss": 0.5977, + "step": 770 + }, + { + "epoch": 0.8158730158730159, + "grad_norm": 0.3254465197758939, + "learning_rate": 4.0474323794590355e-05, + "loss": 0.5534, + "step": 771 + }, + { + "epoch": 0.816931216931217, + "grad_norm": 0.2859692925402336, + "learning_rate": 4.0454723637789104e-05, + "loss": 0.56, + "step": 772 + }, + { + "epoch": 0.817989417989418, + "grad_norm": 0.24530950245641758, + "learning_rate": 4.0435123480987854e-05, + "loss": 0.4727, + "step": 773 + }, + { + "epoch": 0.819047619047619, + "grad_norm": 0.2852863920952454, + "learning_rate": 4.0415523324186596e-05, + "loss": 0.5166, + "step": 774 + }, + { + "epoch": 0.8201058201058201, + "grad_norm": 0.28797276683526324, + "learning_rate": 4.039592316738534e-05, + "loss": 0.6108, + "step": 775 + }, + { + "epoch": 0.8211640211640212, + "grad_norm": 0.24315570228929462, + "learning_rate": 4.037632301058409e-05, + "loss": 0.5383, + "step": 776 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 0.27806729681229697, + "learning_rate": 4.035672285378283e-05, + "loss": 0.5445, + "step": 777 + }, + { + "epoch": 0.8232804232804233, + "grad_norm": 0.27598442145764895, + "learning_rate": 4.033712269698158e-05, + "loss": 0.4656, + "step": 778 + }, + { + "epoch": 0.8243386243386244, + "grad_norm": 0.22788260383702866, + "learning_rate": 4.031752254018032e-05, + "loss": 0.4989, + "step": 779 + }, + { + "epoch": 0.8253968253968254, + "grad_norm": 0.2799668545860428, + "learning_rate": 4.0297922383379065e-05, + "loss": 0.6928, + "step": 780 + }, + { + "epoch": 0.8264550264550264, + "grad_norm": 0.2393669331758126, + "learning_rate": 4.0278322226577814e-05, + "loss": 0.4514, + "step": 781 + }, + { + "epoch": 0.8275132275132275, + "grad_norm": 0.2625212858096316, + "learning_rate": 4.025872206977656e-05, + "loss": 0.5826, + "step": 782 + }, + { + "epoch": 0.8285714285714286, + "grad_norm": 0.28090132618519864, + "learning_rate": 4.0239121912975306e-05, + "loss": 0.5763, + "step": 783 + }, + { + "epoch": 0.8296296296296296, + "grad_norm": 0.254545648971496, + "learning_rate": 4.021952175617405e-05, + "loss": 0.5603, + "step": 784 + }, + { + "epoch": 0.8306878306878307, + "grad_norm": 0.3026212789118765, + "learning_rate": 4.01999215993728e-05, + "loss": 0.5572, + "step": 785 + }, + { + "epoch": 0.8317460317460318, + "grad_norm": 0.2583084756689895, + "learning_rate": 4.018032144257155e-05, + "loss": 0.5687, + "step": 786 + }, + { + "epoch": 0.8328042328042328, + "grad_norm": 0.2912070717602156, + "learning_rate": 4.016072128577029e-05, + "loss": 0.4976, + "step": 787 + }, + { + "epoch": 0.8338624338624339, + "grad_norm": 0.3091276282835055, + "learning_rate": 4.014112112896903e-05, + "loss": 0.602, + "step": 788 + }, + { + "epoch": 0.834920634920635, + "grad_norm": 0.2614685006059475, + "learning_rate": 4.012152097216778e-05, + "loss": 0.6047, + "step": 789 + }, + { + "epoch": 0.8359788359788359, + "grad_norm": 0.2976678306406433, + "learning_rate": 4.010192081536652e-05, + "loss": 0.5572, + "step": 790 + }, + { + "epoch": 0.837037037037037, + "grad_norm": 0.26695480665010896, + "learning_rate": 4.008232065856527e-05, + "loss": 0.5368, + "step": 791 + }, + { + "epoch": 0.8380952380952381, + "grad_norm": 0.27224551418357, + "learning_rate": 4.0062720501764015e-05, + "loss": 0.5715, + "step": 792 + }, + { + "epoch": 0.8391534391534392, + "grad_norm": 0.24476160512891385, + "learning_rate": 4.004312034496276e-05, + "loss": 0.5734, + "step": 793 + }, + { + "epoch": 0.8402116402116402, + "grad_norm": 0.26511024491224705, + "learning_rate": 4.002352018816151e-05, + "loss": 0.5795, + "step": 794 + }, + { + "epoch": 0.8412698412698413, + "grad_norm": 0.34773604650933276, + "learning_rate": 4.0003920031360256e-05, + "loss": 0.5005, + "step": 795 + }, + { + "epoch": 0.8423280423280424, + "grad_norm": 0.2615336683961906, + "learning_rate": 3.9984319874559e-05, + "loss": 0.5364, + "step": 796 + }, + { + "epoch": 0.8433862433862434, + "grad_norm": 0.2739470851171991, + "learning_rate": 3.996471971775774e-05, + "loss": 0.4933, + "step": 797 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 0.3482221259725657, + "learning_rate": 3.994511956095649e-05, + "loss": 0.6087, + "step": 798 + }, + { + "epoch": 0.8455026455026455, + "grad_norm": 7.054547078978019, + "learning_rate": 3.992551940415524e-05, + "loss": 0.5531, + "step": 799 + }, + { + "epoch": 0.8465608465608465, + "grad_norm": 0.3025752493200541, + "learning_rate": 3.990591924735398e-05, + "loss": 0.5327, + "step": 800 + }, + { + "epoch": 0.8476190476190476, + "grad_norm": 0.2515881041243724, + "learning_rate": 3.9886319090552725e-05, + "loss": 0.5089, + "step": 801 + }, + { + "epoch": 0.8486772486772487, + "grad_norm": 0.29447997518392294, + "learning_rate": 3.9866718933751474e-05, + "loss": 0.5635, + "step": 802 + }, + { + "epoch": 0.8497354497354498, + "grad_norm": 1.2056063054042254, + "learning_rate": 3.9847118776950216e-05, + "loss": 0.568, + "step": 803 + }, + { + "epoch": 0.8507936507936508, + "grad_norm": 0.24867885592706862, + "learning_rate": 3.9827518620148966e-05, + "loss": 0.5172, + "step": 804 + }, + { + "epoch": 0.8518518518518519, + "grad_norm": 0.2833308927773852, + "learning_rate": 3.980791846334771e-05, + "loss": 0.5552, + "step": 805 + }, + { + "epoch": 0.852910052910053, + "grad_norm": 0.43186449303900065, + "learning_rate": 3.978831830654645e-05, + "loss": 0.4626, + "step": 806 + }, + { + "epoch": 0.8539682539682539, + "grad_norm": 0.28455060749504574, + "learning_rate": 3.97687181497452e-05, + "loss": 0.5228, + "step": 807 + }, + { + "epoch": 0.855026455026455, + "grad_norm": 0.29552626608893695, + "learning_rate": 3.974911799294395e-05, + "loss": 0.5452, + "step": 808 + }, + { + "epoch": 0.8560846560846561, + "grad_norm": 0.24084287912602692, + "learning_rate": 3.972951783614269e-05, + "loss": 0.54, + "step": 809 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.28708908508185177, + "learning_rate": 3.9709917679341434e-05, + "loss": 0.524, + "step": 810 + }, + { + "epoch": 0.8582010582010582, + "grad_norm": 0.26965168845733906, + "learning_rate": 3.969031752254018e-05, + "loss": 0.5084, + "step": 811 + }, + { + "epoch": 0.8592592592592593, + "grad_norm": 0.27200841727390407, + "learning_rate": 3.967071736573893e-05, + "loss": 0.6633, + "step": 812 + }, + { + "epoch": 0.8603174603174604, + "grad_norm": 0.4577420835451772, + "learning_rate": 3.965111720893767e-05, + "loss": 0.5984, + "step": 813 + }, + { + "epoch": 0.8613756613756614, + "grad_norm": 0.23226446549107316, + "learning_rate": 3.963151705213642e-05, + "loss": 0.5383, + "step": 814 + }, + { + "epoch": 0.8624338624338624, + "grad_norm": 0.24732625840438446, + "learning_rate": 3.961191689533517e-05, + "loss": 0.4663, + "step": 815 + }, + { + "epoch": 0.8634920634920635, + "grad_norm": 0.2442585038196715, + "learning_rate": 3.959231673853391e-05, + "loss": 0.4915, + "step": 816 + }, + { + "epoch": 0.8645502645502645, + "grad_norm": 0.24133270544480462, + "learning_rate": 3.957271658173265e-05, + "loss": 0.5099, + "step": 817 + }, + { + "epoch": 0.8656084656084656, + "grad_norm": 0.2375341297040655, + "learning_rate": 3.95531164249314e-05, + "loss": 0.4853, + "step": 818 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.2374457163459558, + "learning_rate": 3.953351626813015e-05, + "loss": 0.497, + "step": 819 + }, + { + "epoch": 0.8677248677248677, + "grad_norm": 0.2944923584520418, + "learning_rate": 3.951391611132889e-05, + "loss": 0.5841, + "step": 820 + }, + { + "epoch": 0.8687830687830688, + "grad_norm": 0.3103954396715373, + "learning_rate": 3.9494315954527635e-05, + "loss": 0.6308, + "step": 821 + }, + { + "epoch": 0.8698412698412699, + "grad_norm": 0.3044635700570842, + "learning_rate": 3.9474715797726385e-05, + "loss": 0.6198, + "step": 822 + }, + { + "epoch": 0.870899470899471, + "grad_norm": 0.25175807643491943, + "learning_rate": 3.945511564092513e-05, + "loss": 0.5508, + "step": 823 + }, + { + "epoch": 0.8719576719576719, + "grad_norm": 0.22937033545757074, + "learning_rate": 3.9435515484123876e-05, + "loss": 0.5033, + "step": 824 + }, + { + "epoch": 0.873015873015873, + "grad_norm": 0.2414211430197171, + "learning_rate": 3.9415915327322626e-05, + "loss": 0.5281, + "step": 825 + }, + { + "epoch": 0.8740740740740741, + "grad_norm": 0.36303292523703895, + "learning_rate": 3.939631517052136e-05, + "loss": 0.4688, + "step": 826 + }, + { + "epoch": 0.8751322751322751, + "grad_norm": 0.2840607840231142, + "learning_rate": 3.937671501372011e-05, + "loss": 0.5765, + "step": 827 + }, + { + "epoch": 0.8761904761904762, + "grad_norm": 0.26470574352059545, + "learning_rate": 3.935711485691886e-05, + "loss": 0.5827, + "step": 828 + }, + { + "epoch": 0.8772486772486773, + "grad_norm": 0.23448435580139299, + "learning_rate": 3.93375147001176e-05, + "loss": 0.4977, + "step": 829 + }, + { + "epoch": 0.8783068783068783, + "grad_norm": 0.2635026772188355, + "learning_rate": 3.9317914543316345e-05, + "loss": 0.521, + "step": 830 + }, + { + "epoch": 0.8793650793650793, + "grad_norm": 0.26987056951962557, + "learning_rate": 3.9298314386515094e-05, + "loss": 0.5562, + "step": 831 + }, + { + "epoch": 0.8804232804232804, + "grad_norm": 0.2365896896674794, + "learning_rate": 3.927871422971384e-05, + "loss": 0.5206, + "step": 832 + }, + { + "epoch": 0.8814814814814815, + "grad_norm": 0.2617025654901415, + "learning_rate": 3.9259114072912586e-05, + "loss": 0.53, + "step": 833 + }, + { + "epoch": 0.8825396825396825, + "grad_norm": 0.24319463532557173, + "learning_rate": 3.923951391611133e-05, + "loss": 0.5784, + "step": 834 + }, + { + "epoch": 0.8835978835978836, + "grad_norm": 0.30018809738193136, + "learning_rate": 3.921991375931008e-05, + "loss": 0.5594, + "step": 835 + }, + { + "epoch": 0.8846560846560847, + "grad_norm": 0.24564159380840794, + "learning_rate": 3.920031360250882e-05, + "loss": 0.5678, + "step": 836 + }, + { + "epoch": 0.8857142857142857, + "grad_norm": 0.24469707503949964, + "learning_rate": 3.918071344570757e-05, + "loss": 0.6056, + "step": 837 + }, + { + "epoch": 0.8867724867724868, + "grad_norm": 0.28670136066959284, + "learning_rate": 3.916111328890631e-05, + "loss": 0.503, + "step": 838 + }, + { + "epoch": 0.8878306878306879, + "grad_norm": 0.25036374517538473, + "learning_rate": 3.9141513132105054e-05, + "loss": 0.536, + "step": 839 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.22936770624952652, + "learning_rate": 3.9121912975303804e-05, + "loss": 0.4632, + "step": 840 + }, + { + "epoch": 0.8899470899470899, + "grad_norm": 0.23888772118569762, + "learning_rate": 3.910231281850255e-05, + "loss": 0.539, + "step": 841 + }, + { + "epoch": 0.891005291005291, + "grad_norm": 0.26618130929626477, + "learning_rate": 3.9082712661701295e-05, + "loss": 0.5135, + "step": 842 + }, + { + "epoch": 0.8920634920634921, + "grad_norm": 0.23048542050631554, + "learning_rate": 3.906311250490004e-05, + "loss": 0.4616, + "step": 843 + }, + { + "epoch": 0.8931216931216931, + "grad_norm": 0.23846574548002752, + "learning_rate": 3.904351234809879e-05, + "loss": 0.56, + "step": 844 + }, + { + "epoch": 0.8941798941798942, + "grad_norm": 0.25578679656745784, + "learning_rate": 3.9023912191297536e-05, + "loss": 0.5171, + "step": 845 + }, + { + "epoch": 0.8952380952380953, + "grad_norm": 0.2604178853945741, + "learning_rate": 3.900431203449628e-05, + "loss": 0.4736, + "step": 846 + }, + { + "epoch": 0.8962962962962963, + "grad_norm": 0.2543159342822874, + "learning_rate": 3.898471187769502e-05, + "loss": 0.553, + "step": 847 + }, + { + "epoch": 0.8973544973544973, + "grad_norm": 0.25668454175903266, + "learning_rate": 3.896511172089377e-05, + "loss": 0.5986, + "step": 848 + }, + { + "epoch": 0.8984126984126984, + "grad_norm": 0.27128711537893746, + "learning_rate": 3.894551156409251e-05, + "loss": 0.6013, + "step": 849 + }, + { + "epoch": 0.8994708994708994, + "grad_norm": 0.25661383335622456, + "learning_rate": 3.892591140729126e-05, + "loss": 0.6018, + "step": 850 + }, + { + "epoch": 0.9005291005291005, + "grad_norm": 0.26144285059562283, + "learning_rate": 3.8906311250490005e-05, + "loss": 0.5544, + "step": 851 + }, + { + "epoch": 0.9015873015873016, + "grad_norm": 0.24851759763750786, + "learning_rate": 3.888671109368875e-05, + "loss": 0.5417, + "step": 852 + }, + { + "epoch": 0.9026455026455027, + "grad_norm": 0.22366950486260603, + "learning_rate": 3.8867110936887497e-05, + "loss": 0.5216, + "step": 853 + }, + { + "epoch": 0.9037037037037037, + "grad_norm": 0.25522407995231283, + "learning_rate": 3.8847510780086246e-05, + "loss": 0.5511, + "step": 854 + }, + { + "epoch": 0.9047619047619048, + "grad_norm": 0.2459244748277179, + "learning_rate": 3.882791062328499e-05, + "loss": 0.4738, + "step": 855 + }, + { + "epoch": 0.9058201058201059, + "grad_norm": 0.23773875962661492, + "learning_rate": 3.880831046648373e-05, + "loss": 0.5866, + "step": 856 + }, + { + "epoch": 0.9068783068783068, + "grad_norm": 0.2580145368233666, + "learning_rate": 3.878871030968248e-05, + "loss": 0.5448, + "step": 857 + }, + { + "epoch": 0.9079365079365079, + "grad_norm": 0.24118543676000287, + "learning_rate": 3.876911015288123e-05, + "loss": 0.5223, + "step": 858 + }, + { + "epoch": 0.908994708994709, + "grad_norm": 0.24514536124293135, + "learning_rate": 3.874950999607997e-05, + "loss": 0.5641, + "step": 859 + }, + { + "epoch": 0.91005291005291, + "grad_norm": 0.23032586872284455, + "learning_rate": 3.8729909839278714e-05, + "loss": 0.5285, + "step": 860 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 0.24435066352876703, + "learning_rate": 3.8710309682477464e-05, + "loss": 0.5423, + "step": 861 + }, + { + "epoch": 0.9121693121693122, + "grad_norm": 0.2622407130624908, + "learning_rate": 3.8690709525676206e-05, + "loss": 0.5741, + "step": 862 + }, + { + "epoch": 0.9132275132275133, + "grad_norm": 1.3636871490670779, + "learning_rate": 3.8671109368874955e-05, + "loss": 0.7745, + "step": 863 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.3780723682991901, + "learning_rate": 3.86515092120737e-05, + "loss": 0.5085, + "step": 864 + }, + { + "epoch": 0.9153439153439153, + "grad_norm": 0.24726218347601356, + "learning_rate": 3.863190905527244e-05, + "loss": 0.5712, + "step": 865 + }, + { + "epoch": 0.9164021164021164, + "grad_norm": 0.27126893307687105, + "learning_rate": 3.861230889847119e-05, + "loss": 0.5255, + "step": 866 + }, + { + "epoch": 0.9174603174603174, + "grad_norm": 0.2426515970790078, + "learning_rate": 3.859270874166994e-05, + "loss": 0.5768, + "step": 867 + }, + { + "epoch": 0.9185185185185185, + "grad_norm": 0.28594907335057773, + "learning_rate": 3.857310858486868e-05, + "loss": 0.5528, + "step": 868 + }, + { + "epoch": 0.9195767195767196, + "grad_norm": 0.23606697714730085, + "learning_rate": 3.8553508428067424e-05, + "loss": 0.5133, + "step": 869 + }, + { + "epoch": 0.9206349206349206, + "grad_norm": 0.27926385748395915, + "learning_rate": 3.853390827126617e-05, + "loss": 0.5796, + "step": 870 + }, + { + "epoch": 0.9216931216931217, + "grad_norm": 0.24343714813051942, + "learning_rate": 3.851430811446492e-05, + "loss": 0.5295, + "step": 871 + }, + { + "epoch": 0.9227513227513228, + "grad_norm": 0.26244227600390824, + "learning_rate": 3.8494707957663665e-05, + "loss": 0.4905, + "step": 872 + }, + { + "epoch": 0.9238095238095239, + "grad_norm": 0.2765749086021136, + "learning_rate": 3.847510780086241e-05, + "loss": 0.5227, + "step": 873 + }, + { + "epoch": 0.9248677248677248, + "grad_norm": 0.22924047622529786, + "learning_rate": 3.8455507644061157e-05, + "loss": 0.5447, + "step": 874 + }, + { + "epoch": 0.9259259259259259, + "grad_norm": 0.26004927012669626, + "learning_rate": 3.84359074872599e-05, + "loss": 0.5876, + "step": 875 + }, + { + "epoch": 0.926984126984127, + "grad_norm": 0.23257781793426832, + "learning_rate": 3.841630733045865e-05, + "loss": 0.4462, + "step": 876 + }, + { + "epoch": 0.928042328042328, + "grad_norm": 0.23530057335980378, + "learning_rate": 3.839670717365739e-05, + "loss": 0.5099, + "step": 877 + }, + { + "epoch": 0.9291005291005291, + "grad_norm": 0.22869829817586848, + "learning_rate": 3.837710701685613e-05, + "loss": 0.4933, + "step": 878 + }, + { + "epoch": 0.9301587301587302, + "grad_norm": 0.2538072480043844, + "learning_rate": 3.835750686005488e-05, + "loss": 0.4838, + "step": 879 + }, + { + "epoch": 0.9312169312169312, + "grad_norm": 0.23947855637218263, + "learning_rate": 3.833790670325363e-05, + "loss": 0.5569, + "step": 880 + }, + { + "epoch": 0.9322751322751323, + "grad_norm": 0.2379463855293634, + "learning_rate": 3.8318306546452374e-05, + "loss": 0.5169, + "step": 881 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.2706377388516219, + "learning_rate": 3.829870638965112e-05, + "loss": 0.4809, + "step": 882 + }, + { + "epoch": 0.9343915343915344, + "grad_norm": 0.2599921459464922, + "learning_rate": 3.8279106232849866e-05, + "loss": 0.5511, + "step": 883 + }, + { + "epoch": 0.9354497354497354, + "grad_norm": 0.23700018733026876, + "learning_rate": 3.8259506076048615e-05, + "loss": 0.5508, + "step": 884 + }, + { + "epoch": 0.9365079365079365, + "grad_norm": 0.27148173561209227, + "learning_rate": 3.823990591924735e-05, + "loss": 0.5598, + "step": 885 + }, + { + "epoch": 0.9375661375661376, + "grad_norm": 0.23284787985918257, + "learning_rate": 3.82203057624461e-05, + "loss": 0.4889, + "step": 886 + }, + { + "epoch": 0.9386243386243386, + "grad_norm": 0.2289097690533918, + "learning_rate": 3.820070560564485e-05, + "loss": 0.484, + "step": 887 + }, + { + "epoch": 0.9396825396825397, + "grad_norm": 0.29024067491700634, + "learning_rate": 3.818110544884359e-05, + "loss": 0.527, + "step": 888 + }, + { + "epoch": 0.9407407407407408, + "grad_norm": 0.2573481910146356, + "learning_rate": 3.8161505292042334e-05, + "loss": 0.4921, + "step": 889 + }, + { + "epoch": 0.9417989417989417, + "grad_norm": 0.2287016832994264, + "learning_rate": 3.8141905135241084e-05, + "loss": 0.4975, + "step": 890 + }, + { + "epoch": 0.9428571428571428, + "grad_norm": 0.2793182907010499, + "learning_rate": 3.8122304978439826e-05, + "loss": 0.5901, + "step": 891 + }, + { + "epoch": 0.9439153439153439, + "grad_norm": 0.2815450954484471, + "learning_rate": 3.8102704821638575e-05, + "loss": 0.4907, + "step": 892 + }, + { + "epoch": 0.944973544973545, + "grad_norm": 0.24825039265577378, + "learning_rate": 3.808310466483732e-05, + "loss": 0.5845, + "step": 893 + }, + { + "epoch": 0.946031746031746, + "grad_norm": 0.2684156933456286, + "learning_rate": 3.806350450803607e-05, + "loss": 0.4972, + "step": 894 + }, + { + "epoch": 0.9470899470899471, + "grad_norm": 0.2777947126936837, + "learning_rate": 3.804390435123481e-05, + "loss": 0.5588, + "step": 895 + }, + { + "epoch": 0.9481481481481482, + "grad_norm": 0.2479091360698014, + "learning_rate": 3.802430419443356e-05, + "loss": 0.5882, + "step": 896 + }, + { + "epoch": 0.9492063492063492, + "grad_norm": 0.2657834372378511, + "learning_rate": 3.800470403763231e-05, + "loss": 0.5495, + "step": 897 + }, + { + "epoch": 0.9502645502645503, + "grad_norm": 0.9282022073586812, + "learning_rate": 3.7985103880831044e-05, + "loss": 0.5746, + "step": 898 + }, + { + "epoch": 0.9513227513227513, + "grad_norm": 0.25172528574506164, + "learning_rate": 3.796550372402979e-05, + "loss": 0.4896, + "step": 899 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.2555445883398386, + "learning_rate": 3.794590356722854e-05, + "loss": 0.5736, + "step": 900 + }, + { + "epoch": 0.9534391534391534, + "grad_norm": 0.249355354605511, + "learning_rate": 3.7926303410427285e-05, + "loss": 0.4817, + "step": 901 + }, + { + "epoch": 0.9544973544973545, + "grad_norm": 0.2149812919434468, + "learning_rate": 3.790670325362603e-05, + "loss": 0.4524, + "step": 902 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 0.2703911118438873, + "learning_rate": 3.788710309682478e-05, + "loss": 0.5443, + "step": 903 + }, + { + "epoch": 0.9566137566137566, + "grad_norm": 0.223977288588384, + "learning_rate": 3.786750294002352e-05, + "loss": 0.4925, + "step": 904 + }, + { + "epoch": 0.9576719576719577, + "grad_norm": 0.27011460419403965, + "learning_rate": 3.784790278322227e-05, + "loss": 0.5812, + "step": 905 + }, + { + "epoch": 0.9587301587301588, + "grad_norm": 0.23957948838890472, + "learning_rate": 3.782830262642101e-05, + "loss": 0.5007, + "step": 906 + }, + { + "epoch": 0.9597883597883597, + "grad_norm": 0.2264533105409485, + "learning_rate": 3.780870246961976e-05, + "loss": 0.5141, + "step": 907 + }, + { + "epoch": 0.9608465608465608, + "grad_norm": 0.24934710629027704, + "learning_rate": 3.77891023128185e-05, + "loss": 0.5271, + "step": 908 + }, + { + "epoch": 0.9619047619047619, + "grad_norm": 0.24400597517616687, + "learning_rate": 3.776950215601725e-05, + "loss": 0.5524, + "step": 909 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 0.24087812537974612, + "learning_rate": 3.7749901999215994e-05, + "loss": 0.4046, + "step": 910 + }, + { + "epoch": 0.964021164021164, + "grad_norm": 2.6978887892991747, + "learning_rate": 3.773030184241474e-05, + "loss": 0.5807, + "step": 911 + }, + { + "epoch": 0.9650793650793651, + "grad_norm": 0.31338696022947576, + "learning_rate": 3.7710701685613486e-05, + "loss": 0.5033, + "step": 912 + }, + { + "epoch": 0.9661375661375662, + "grad_norm": 0.3061821128532338, + "learning_rate": 3.7691101528812235e-05, + "loss": 0.5131, + "step": 913 + }, + { + "epoch": 0.9671957671957672, + "grad_norm": 0.23119947689356746, + "learning_rate": 3.767150137201098e-05, + "loss": 0.4257, + "step": 914 + }, + { + "epoch": 0.9682539682539683, + "grad_norm": 0.27958747828152924, + "learning_rate": 3.765190121520972e-05, + "loss": 0.543, + "step": 915 + }, + { + "epoch": 0.9693121693121693, + "grad_norm": 0.30495291790476814, + "learning_rate": 3.763230105840847e-05, + "loss": 0.5112, + "step": 916 + }, + { + "epoch": 0.9703703703703703, + "grad_norm": 0.2815945843930539, + "learning_rate": 3.761270090160722e-05, + "loss": 0.6096, + "step": 917 + }, + { + "epoch": 0.9714285714285714, + "grad_norm": 0.27333650578935825, + "learning_rate": 3.759310074480596e-05, + "loss": 0.5376, + "step": 918 + }, + { + "epoch": 0.9724867724867725, + "grad_norm": 0.2982346639473295, + "learning_rate": 3.7573500588004704e-05, + "loss": 0.452, + "step": 919 + }, + { + "epoch": 0.9735449735449735, + "grad_norm": 0.2677922848009343, + "learning_rate": 3.755390043120345e-05, + "loss": 0.4959, + "step": 920 + }, + { + "epoch": 0.9746031746031746, + "grad_norm": 0.2598899505130424, + "learning_rate": 3.7534300274402196e-05, + "loss": 0.5954, + "step": 921 + }, + { + "epoch": 0.9756613756613757, + "grad_norm": 0.29688775573892967, + "learning_rate": 3.7514700117600945e-05, + "loss": 0.4943, + "step": 922 + }, + { + "epoch": 0.9767195767195768, + "grad_norm": 0.21278741966910175, + "learning_rate": 3.749509996079969e-05, + "loss": 0.469, + "step": 923 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 0.27458087883654253, + "learning_rate": 3.747549980399843e-05, + "loss": 0.5852, + "step": 924 + }, + { + "epoch": 0.9788359788359788, + "grad_norm": 0.26092414227041094, + "learning_rate": 3.745589964719718e-05, + "loss": 0.5247, + "step": 925 + }, + { + "epoch": 0.9798941798941799, + "grad_norm": 0.22726587133642012, + "learning_rate": 3.743629949039593e-05, + "loss": 0.5845, + "step": 926 + }, + { + "epoch": 0.9809523809523809, + "grad_norm": 0.24328946071436178, + "learning_rate": 3.741669933359467e-05, + "loss": 0.5947, + "step": 927 + }, + { + "epoch": 0.982010582010582, + "grad_norm": 0.25227625801156794, + "learning_rate": 3.7397099176793413e-05, + "loss": 0.5245, + "step": 928 + }, + { + "epoch": 0.9830687830687831, + "grad_norm": 1.8750127781404309, + "learning_rate": 3.737749901999216e-05, + "loss": 0.5496, + "step": 929 + }, + { + "epoch": 0.9841269841269841, + "grad_norm": 0.2690191369601627, + "learning_rate": 3.735789886319091e-05, + "loss": 0.5706, + "step": 930 + }, + { + "epoch": 0.9851851851851852, + "grad_norm": 0.23072633191225864, + "learning_rate": 3.7338298706389654e-05, + "loss": 0.5585, + "step": 931 + }, + { + "epoch": 0.9862433862433863, + "grad_norm": 0.24321911538830931, + "learning_rate": 3.73186985495884e-05, + "loss": 0.5314, + "step": 932 + }, + { + "epoch": 0.9873015873015873, + "grad_norm": 0.2265652217349377, + "learning_rate": 3.7299098392787146e-05, + "loss": 0.5157, + "step": 933 + }, + { + "epoch": 0.9883597883597883, + "grad_norm": 0.2798879913889636, + "learning_rate": 3.727949823598589e-05, + "loss": 0.527, + "step": 934 + }, + { + "epoch": 0.9894179894179894, + "grad_norm": 0.3392043127421207, + "learning_rate": 3.725989807918464e-05, + "loss": 0.5649, + "step": 935 + }, + { + "epoch": 0.9904761904761905, + "grad_norm": 0.46994870334965927, + "learning_rate": 3.724029792238338e-05, + "loss": 0.5019, + "step": 936 + }, + { + "epoch": 0.9915343915343915, + "grad_norm": 0.31670821542256694, + "learning_rate": 3.722069776558212e-05, + "loss": 0.4984, + "step": 937 + }, + { + "epoch": 0.9925925925925926, + "grad_norm": 0.30042583096668124, + "learning_rate": 3.720109760878087e-05, + "loss": 0.443, + "step": 938 + }, + { + "epoch": 0.9936507936507937, + "grad_norm": 0.26554660858783213, + "learning_rate": 3.718149745197962e-05, + "loss": 0.5143, + "step": 939 + }, + { + "epoch": 0.9947089947089947, + "grad_norm": 0.31047007588956205, + "learning_rate": 3.7161897295178364e-05, + "loss": 0.5113, + "step": 940 + }, + { + "epoch": 0.9957671957671957, + "grad_norm": 0.32328394275671823, + "learning_rate": 3.7142297138377106e-05, + "loss": 0.5627, + "step": 941 + }, + { + "epoch": 0.9968253968253968, + "grad_norm": 0.30050781187053605, + "learning_rate": 3.7122696981575856e-05, + "loss": 0.4492, + "step": 942 + }, + { + "epoch": 0.9978835978835979, + "grad_norm": 0.2883856032205587, + "learning_rate": 3.7103096824774605e-05, + "loss": 0.573, + "step": 943 + }, + { + "epoch": 0.9989417989417989, + "grad_norm": 0.3095113640917161, + "learning_rate": 3.708349666797335e-05, + "loss": 0.5471, + "step": 944 + }, + { + "epoch": 1.0, + "grad_norm": 0.2740280000061703, + "learning_rate": 3.706389651117209e-05, + "loss": 0.5048, + "step": 945 + }, + { + "epoch": 1.001058201058201, + "grad_norm": 0.3430281338192382, + "learning_rate": 3.704429635437084e-05, + "loss": 0.484, + "step": 946 + }, + { + "epoch": 1.0021164021164022, + "grad_norm": 0.5765855281602044, + "learning_rate": 3.702469619756958e-05, + "loss": 0.421, + "step": 947 + }, + { + "epoch": 1.0031746031746032, + "grad_norm": 0.32549789453659606, + "learning_rate": 3.700509604076833e-05, + "loss": 0.5094, + "step": 948 + }, + { + "epoch": 1.0042328042328041, + "grad_norm": 0.27167110880361595, + "learning_rate": 3.698549588396707e-05, + "loss": 0.4292, + "step": 949 + }, + { + "epoch": 1.0052910052910053, + "grad_norm": 0.2634410312734115, + "learning_rate": 3.6965895727165816e-05, + "loss": 0.4056, + "step": 950 + }, + { + "epoch": 1.0063492063492063, + "grad_norm": 0.290932789089174, + "learning_rate": 3.6946295570364565e-05, + "loss": 0.4966, + "step": 951 + }, + { + "epoch": 1.0074074074074073, + "grad_norm": 0.277649198156178, + "learning_rate": 3.6926695413563314e-05, + "loss": 0.5468, + "step": 952 + }, + { + "epoch": 1.0084656084656085, + "grad_norm": 0.2871264585009383, + "learning_rate": 3.690709525676205e-05, + "loss": 0.4619, + "step": 953 + }, + { + "epoch": 1.0095238095238095, + "grad_norm": 0.26922960005310304, + "learning_rate": 3.68874950999608e-05, + "loss": 0.4126, + "step": 954 + }, + { + "epoch": 1.0105820105820107, + "grad_norm": 0.2630654741035761, + "learning_rate": 3.686789494315955e-05, + "loss": 0.4967, + "step": 955 + }, + { + "epoch": 1.0116402116402117, + "grad_norm": 0.30760949861741693, + "learning_rate": 3.68482947863583e-05, + "loss": 0.3988, + "step": 956 + }, + { + "epoch": 1.0126984126984127, + "grad_norm": 0.30191598263981867, + "learning_rate": 3.6828694629557034e-05, + "loss": 0.4891, + "step": 957 + }, + { + "epoch": 1.0137566137566139, + "grad_norm": 0.24918234554246604, + "learning_rate": 3.680909447275578e-05, + "loss": 0.4265, + "step": 958 + }, + { + "epoch": 1.0148148148148148, + "grad_norm": 0.25444420109426397, + "learning_rate": 3.678949431595453e-05, + "loss": 0.4755, + "step": 959 + }, + { + "epoch": 1.0158730158730158, + "grad_norm": 0.2704029549763871, + "learning_rate": 3.6769894159153275e-05, + "loss": 0.4622, + "step": 960 + }, + { + "epoch": 1.016931216931217, + "grad_norm": 0.25814705258519083, + "learning_rate": 3.675029400235202e-05, + "loss": 0.45, + "step": 961 + }, + { + "epoch": 1.017989417989418, + "grad_norm": 0.24396615670006316, + "learning_rate": 3.6730693845550766e-05, + "loss": 0.4621, + "step": 962 + }, + { + "epoch": 1.019047619047619, + "grad_norm": 0.24613409465997452, + "learning_rate": 3.671109368874951e-05, + "loss": 0.3997, + "step": 963 + }, + { + "epoch": 1.0201058201058202, + "grad_norm": 0.2446865998835932, + "learning_rate": 3.669149353194826e-05, + "loss": 0.4312, + "step": 964 + }, + { + "epoch": 1.0211640211640212, + "grad_norm": 0.24064127384639672, + "learning_rate": 3.6671893375147e-05, + "loss": 0.4879, + "step": 965 + }, + { + "epoch": 1.0222222222222221, + "grad_norm": 0.24643135413567363, + "learning_rate": 3.665229321834575e-05, + "loss": 0.4811, + "step": 966 + }, + { + "epoch": 1.0232804232804233, + "grad_norm": 0.3002457243261394, + "learning_rate": 3.663269306154449e-05, + "loss": 0.5209, + "step": 967 + }, + { + "epoch": 1.0243386243386243, + "grad_norm": 0.22426207621628652, + "learning_rate": 3.661309290474324e-05, + "loss": 0.4123, + "step": 968 + }, + { + "epoch": 1.0253968253968253, + "grad_norm": 0.24667559321374757, + "learning_rate": 3.659349274794199e-05, + "loss": 0.462, + "step": 969 + }, + { + "epoch": 1.0264550264550265, + "grad_norm": 0.21521332687361075, + "learning_rate": 3.6573892591140727e-05, + "loss": 0.3874, + "step": 970 + }, + { + "epoch": 1.0275132275132275, + "grad_norm": 0.21371142578879992, + "learning_rate": 3.6554292434339476e-05, + "loss": 0.4216, + "step": 971 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 0.2457752256589608, + "learning_rate": 3.6534692277538225e-05, + "loss": 0.4751, + "step": 972 + }, + { + "epoch": 1.0296296296296297, + "grad_norm": 0.23036724262890812, + "learning_rate": 3.651509212073697e-05, + "loss": 0.4582, + "step": 973 + }, + { + "epoch": 1.0306878306878307, + "grad_norm": 0.2398481443361909, + "learning_rate": 3.649549196393571e-05, + "loss": 0.4908, + "step": 974 + }, + { + "epoch": 1.0317460317460316, + "grad_norm": 0.2191458862749879, + "learning_rate": 3.647589180713446e-05, + "loss": 0.3675, + "step": 975 + }, + { + "epoch": 1.0328042328042328, + "grad_norm": 0.24624710147713, + "learning_rate": 3.64562916503332e-05, + "loss": 0.4168, + "step": 976 + }, + { + "epoch": 1.0338624338624338, + "grad_norm": 0.2505135808681229, + "learning_rate": 3.643669149353195e-05, + "loss": 0.4317, + "step": 977 + }, + { + "epoch": 1.034920634920635, + "grad_norm": 0.23347770709457338, + "learning_rate": 3.6417091336730694e-05, + "loss": 0.4385, + "step": 978 + }, + { + "epoch": 1.035978835978836, + "grad_norm": 0.228128885877278, + "learning_rate": 3.639749117992944e-05, + "loss": 0.4258, + "step": 979 + }, + { + "epoch": 1.037037037037037, + "grad_norm": 0.2629471686081972, + "learning_rate": 3.6377891023128185e-05, + "loss": 0.4682, + "step": 980 + }, + { + "epoch": 1.0380952380952382, + "grad_norm": 0.2649161639864007, + "learning_rate": 3.6358290866326935e-05, + "loss": 0.4061, + "step": 981 + }, + { + "epoch": 1.0391534391534392, + "grad_norm": 0.23269995847151623, + "learning_rate": 3.633869070952568e-05, + "loss": 0.3948, + "step": 982 + }, + { + "epoch": 1.0402116402116401, + "grad_norm": 0.3069159899042451, + "learning_rate": 3.631909055272442e-05, + "loss": 0.4964, + "step": 983 + }, + { + "epoch": 1.0412698412698413, + "grad_norm": 0.28329260505792503, + "learning_rate": 3.629949039592317e-05, + "loss": 0.446, + "step": 984 + }, + { + "epoch": 1.0423280423280423, + "grad_norm": 0.22085634036978702, + "learning_rate": 3.627989023912192e-05, + "loss": 0.397, + "step": 985 + }, + { + "epoch": 1.0433862433862433, + "grad_norm": 0.2533209615376046, + "learning_rate": 3.626029008232066e-05, + "loss": 0.4478, + "step": 986 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 0.24209341289293657, + "learning_rate": 3.62406899255194e-05, + "loss": 0.4514, + "step": 987 + }, + { + "epoch": 1.0455026455026455, + "grad_norm": 0.22467546448533063, + "learning_rate": 3.622108976871815e-05, + "loss": 0.4335, + "step": 988 + }, + { + "epoch": 1.0465608465608465, + "grad_norm": 0.21049472574558023, + "learning_rate": 3.6201489611916895e-05, + "loss": 0.408, + "step": 989 + }, + { + "epoch": 1.0476190476190477, + "grad_norm": 0.27894693629024636, + "learning_rate": 3.6181889455115644e-05, + "loss": 0.4497, + "step": 990 + }, + { + "epoch": 1.0486772486772487, + "grad_norm": 0.2569069796077995, + "learning_rate": 3.6162289298314387e-05, + "loss": 0.4601, + "step": 991 + }, + { + "epoch": 1.0497354497354496, + "grad_norm": 0.2284829852995991, + "learning_rate": 3.6142689141513136e-05, + "loss": 0.3953, + "step": 992 + }, + { + "epoch": 1.0507936507936508, + "grad_norm": 0.29209053034281596, + "learning_rate": 3.612308898471188e-05, + "loss": 0.5659, + "step": 993 + }, + { + "epoch": 1.0518518518518518, + "grad_norm": 0.3674972112779229, + "learning_rate": 3.610348882791063e-05, + "loss": 0.5056, + "step": 994 + }, + { + "epoch": 1.052910052910053, + "grad_norm": 0.24428585787867374, + "learning_rate": 3.608388867110937e-05, + "loss": 0.4746, + "step": 995 + }, + { + "epoch": 1.053968253968254, + "grad_norm": 0.24244488023201471, + "learning_rate": 3.606428851430811e-05, + "loss": 0.4863, + "step": 996 + }, + { + "epoch": 1.055026455026455, + "grad_norm": 0.2261470911889456, + "learning_rate": 3.604468835750686e-05, + "loss": 0.449, + "step": 997 + }, + { + "epoch": 1.0560846560846562, + "grad_norm": 0.23100103297173594, + "learning_rate": 3.602508820070561e-05, + "loss": 0.4325, + "step": 998 + }, + { + "epoch": 1.0571428571428572, + "grad_norm": 0.2383765185611898, + "learning_rate": 3.6005488043904354e-05, + "loss": 0.4625, + "step": 999 + }, + { + "epoch": 1.0582010582010581, + "grad_norm": 0.22794426514581065, + "learning_rate": 3.5985887887103096e-05, + "loss": 0.4055, + "step": 1000 + }, + { + "epoch": 1.0592592592592593, + "grad_norm": 0.24878598922435188, + "learning_rate": 3.5966287730301845e-05, + "loss": 0.4176, + "step": 1001 + }, + { + "epoch": 1.0603174603174603, + "grad_norm": 0.24946112914172103, + "learning_rate": 3.594668757350059e-05, + "loss": 0.4949, + "step": 1002 + }, + { + "epoch": 1.0613756613756613, + "grad_norm": 0.24129845809349035, + "learning_rate": 3.592708741669934e-05, + "loss": 0.3912, + "step": 1003 + }, + { + "epoch": 1.0624338624338625, + "grad_norm": 0.24281750665894428, + "learning_rate": 3.590748725989808e-05, + "loss": 0.4568, + "step": 1004 + }, + { + "epoch": 1.0634920634920635, + "grad_norm": 0.20760278598384954, + "learning_rate": 3.588788710309683e-05, + "loss": 0.3948, + "step": 1005 + }, + { + "epoch": 1.0645502645502645, + "grad_norm": 0.23431721866007746, + "learning_rate": 3.586828694629557e-05, + "loss": 0.424, + "step": 1006 + }, + { + "epoch": 1.0656084656084657, + "grad_norm": 0.2471701897811236, + "learning_rate": 3.584868678949432e-05, + "loss": 0.4256, + "step": 1007 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.2302012676934357, + "learning_rate": 3.582908663269306e-05, + "loss": 0.4277, + "step": 1008 + }, + { + "epoch": 1.0677248677248676, + "grad_norm": 0.22452613295126128, + "learning_rate": 3.5809486475891806e-05, + "loss": 0.4465, + "step": 1009 + }, + { + "epoch": 1.0687830687830688, + "grad_norm": 0.2257507725040729, + "learning_rate": 3.5789886319090555e-05, + "loss": 0.4698, + "step": 1010 + }, + { + "epoch": 1.0698412698412698, + "grad_norm": 0.24453737754436786, + "learning_rate": 3.5770286162289304e-05, + "loss": 0.4469, + "step": 1011 + }, + { + "epoch": 1.0708994708994708, + "grad_norm": 0.24956877063093863, + "learning_rate": 3.5750686005488047e-05, + "loss": 0.4626, + "step": 1012 + }, + { + "epoch": 1.071957671957672, + "grad_norm": 0.2600831587111751, + "learning_rate": 3.573108584868679e-05, + "loss": 0.4537, + "step": 1013 + }, + { + "epoch": 1.073015873015873, + "grad_norm": 4.587346913530301, + "learning_rate": 3.571148569188554e-05, + "loss": 0.558, + "step": 1014 + }, + { + "epoch": 1.074074074074074, + "grad_norm": 0.29359742422528, + "learning_rate": 3.569188553508429e-05, + "loss": 0.4682, + "step": 1015 + }, + { + "epoch": 1.0751322751322752, + "grad_norm": 0.31833732588739005, + "learning_rate": 3.567228537828303e-05, + "loss": 0.4743, + "step": 1016 + }, + { + "epoch": 1.0761904761904761, + "grad_norm": 0.2553867628775377, + "learning_rate": 3.565268522148177e-05, + "loss": 0.551, + "step": 1017 + }, + { + "epoch": 1.0772486772486773, + "grad_norm": 0.24879754575790247, + "learning_rate": 3.563308506468052e-05, + "loss": 0.4524, + "step": 1018 + }, + { + "epoch": 1.0783068783068783, + "grad_norm": 0.28766655345957765, + "learning_rate": 3.5613484907879264e-05, + "loss": 0.4384, + "step": 1019 + }, + { + "epoch": 1.0793650793650793, + "grad_norm": 0.41507355236002946, + "learning_rate": 3.5593884751078014e-05, + "loss": 0.4206, + "step": 1020 + }, + { + "epoch": 1.0804232804232805, + "grad_norm": 0.23507148230554162, + "learning_rate": 3.5574284594276756e-05, + "loss": 0.4353, + "step": 1021 + }, + { + "epoch": 1.0814814814814815, + "grad_norm": 0.2622046524529905, + "learning_rate": 3.55546844374755e-05, + "loss": 0.487, + "step": 1022 + }, + { + "epoch": 1.0825396825396825, + "grad_norm": 0.2963869501406697, + "learning_rate": 3.553508428067425e-05, + "loss": 0.5234, + "step": 1023 + }, + { + "epoch": 1.0835978835978837, + "grad_norm": 0.22883220209770097, + "learning_rate": 3.5515484123873e-05, + "loss": 0.4493, + "step": 1024 + }, + { + "epoch": 1.0846560846560847, + "grad_norm": 0.2569611944727975, + "learning_rate": 3.549588396707173e-05, + "loss": 0.466, + "step": 1025 + }, + { + "epoch": 1.0857142857142856, + "grad_norm": 0.254023601263868, + "learning_rate": 3.547628381027048e-05, + "loss": 0.459, + "step": 1026 + }, + { + "epoch": 1.0867724867724868, + "grad_norm": 0.23407024600575801, + "learning_rate": 3.545668365346923e-05, + "loss": 0.45, + "step": 1027 + }, + { + "epoch": 1.0878306878306878, + "grad_norm": 0.2599527517255244, + "learning_rate": 3.543708349666798e-05, + "loss": 0.4115, + "step": 1028 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 0.22311611567918718, + "learning_rate": 3.5417483339866716e-05, + "loss": 0.3979, + "step": 1029 + }, + { + "epoch": 1.08994708994709, + "grad_norm": 0.2299712856828212, + "learning_rate": 3.5397883183065466e-05, + "loss": 0.3694, + "step": 1030 + }, + { + "epoch": 1.091005291005291, + "grad_norm": 0.21532417571199142, + "learning_rate": 3.5378283026264215e-05, + "loss": 0.3551, + "step": 1031 + }, + { + "epoch": 1.0920634920634922, + "grad_norm": 0.23843932815464314, + "learning_rate": 3.535868286946296e-05, + "loss": 0.4513, + "step": 1032 + }, + { + "epoch": 1.0931216931216932, + "grad_norm": 0.21531850573097505, + "learning_rate": 3.53390827126617e-05, + "loss": 0.3931, + "step": 1033 + }, + { + "epoch": 1.0941798941798941, + "grad_norm": 0.3068198759456981, + "learning_rate": 3.531948255586045e-05, + "loss": 0.4891, + "step": 1034 + }, + { + "epoch": 1.0952380952380953, + "grad_norm": 0.23836601091918505, + "learning_rate": 3.529988239905919e-05, + "loss": 0.439, + "step": 1035 + }, + { + "epoch": 1.0962962962962963, + "grad_norm": 0.2437498779338309, + "learning_rate": 3.528028224225794e-05, + "loss": 0.4149, + "step": 1036 + }, + { + "epoch": 1.0973544973544973, + "grad_norm": 0.27351894671972754, + "learning_rate": 3.526068208545668e-05, + "loss": 0.4416, + "step": 1037 + }, + { + "epoch": 1.0984126984126985, + "grad_norm": 0.27770230078648367, + "learning_rate": 3.5241081928655426e-05, + "loss": 0.4884, + "step": 1038 + }, + { + "epoch": 1.0994708994708995, + "grad_norm": 0.25718315589569163, + "learning_rate": 3.5221481771854175e-05, + "loss": 0.4578, + "step": 1039 + }, + { + "epoch": 1.1005291005291005, + "grad_norm": 0.2594326299087, + "learning_rate": 3.5201881615052924e-05, + "loss": 0.4864, + "step": 1040 + }, + { + "epoch": 1.1015873015873017, + "grad_norm": 0.2472630187462841, + "learning_rate": 3.5182281458251674e-05, + "loss": 0.4451, + "step": 1041 + }, + { + "epoch": 1.1026455026455027, + "grad_norm": 0.25781560522935376, + "learning_rate": 3.516268130145041e-05, + "loss": 0.4182, + "step": 1042 + }, + { + "epoch": 1.1037037037037036, + "grad_norm": 0.2535547232578919, + "learning_rate": 3.514308114464916e-05, + "loss": 0.4749, + "step": 1043 + }, + { + "epoch": 1.1047619047619048, + "grad_norm": 0.254021416442326, + "learning_rate": 3.512348098784791e-05, + "loss": 0.471, + "step": 1044 + }, + { + "epoch": 1.1058201058201058, + "grad_norm": 0.28102597311344235, + "learning_rate": 3.510388083104665e-05, + "loss": 0.4615, + "step": 1045 + }, + { + "epoch": 1.1068783068783068, + "grad_norm": 0.23113213821636622, + "learning_rate": 3.508428067424539e-05, + "loss": 0.4147, + "step": 1046 + }, + { + "epoch": 1.107936507936508, + "grad_norm": 0.2474923745981431, + "learning_rate": 3.506468051744414e-05, + "loss": 0.4437, + "step": 1047 + }, + { + "epoch": 1.108994708994709, + "grad_norm": 0.24830934850829475, + "learning_rate": 3.5045080360642884e-05, + "loss": 0.4376, + "step": 1048 + }, + { + "epoch": 1.11005291005291, + "grad_norm": 0.47806726597633753, + "learning_rate": 3.5025480203841634e-05, + "loss": 0.4324, + "step": 1049 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.32025751473392294, + "learning_rate": 3.5005880047040376e-05, + "loss": 0.5246, + "step": 1050 + }, + { + "epoch": 1.1121693121693121, + "grad_norm": 0.28097198860260764, + "learning_rate": 3.498627989023912e-05, + "loss": 0.4598, + "step": 1051 + }, + { + "epoch": 1.1132275132275131, + "grad_norm": 0.254947622562914, + "learning_rate": 3.496667973343787e-05, + "loss": 0.4222, + "step": 1052 + }, + { + "epoch": 1.1142857142857143, + "grad_norm": 0.3349593604089576, + "learning_rate": 3.494707957663662e-05, + "loss": 0.4155, + "step": 1053 + }, + { + "epoch": 1.1153439153439153, + "grad_norm": 0.26874038889591473, + "learning_rate": 3.492747941983536e-05, + "loss": 0.4375, + "step": 1054 + }, + { + "epoch": 1.1164021164021163, + "grad_norm": 0.26633236590448656, + "learning_rate": 3.49078792630341e-05, + "loss": 0.4267, + "step": 1055 + }, + { + "epoch": 1.1174603174603175, + "grad_norm": 0.2884717153428804, + "learning_rate": 3.488827910623285e-05, + "loss": 0.4257, + "step": 1056 + }, + { + "epoch": 1.1185185185185185, + "grad_norm": 0.4452379521735504, + "learning_rate": 3.48686789494316e-05, + "loss": 0.4332, + "step": 1057 + }, + { + "epoch": 1.1195767195767197, + "grad_norm": 0.7702289596361585, + "learning_rate": 3.484907879263034e-05, + "loss": 0.4349, + "step": 1058 + }, + { + "epoch": 1.1206349206349207, + "grad_norm": 0.3656397372057208, + "learning_rate": 3.4829478635829086e-05, + "loss": 0.4238, + "step": 1059 + }, + { + "epoch": 1.1216931216931216, + "grad_norm": 0.4866179217310575, + "learning_rate": 3.4809878479027835e-05, + "loss": 0.4216, + "step": 1060 + }, + { + "epoch": 1.1227513227513228, + "grad_norm": 0.33474959074863353, + "learning_rate": 3.479027832222658e-05, + "loss": 0.4418, + "step": 1061 + }, + { + "epoch": 1.1238095238095238, + "grad_norm": 0.27607814779984247, + "learning_rate": 3.477067816542533e-05, + "loss": 0.4364, + "step": 1062 + }, + { + "epoch": 1.1248677248677248, + "grad_norm": 0.3407943035819788, + "learning_rate": 3.475107800862407e-05, + "loss": 0.4665, + "step": 1063 + }, + { + "epoch": 1.125925925925926, + "grad_norm": 0.33884649086749913, + "learning_rate": 3.473147785182282e-05, + "loss": 0.4543, + "step": 1064 + }, + { + "epoch": 1.126984126984127, + "grad_norm": 0.29045801993635223, + "learning_rate": 3.471187769502156e-05, + "loss": 0.4612, + "step": 1065 + }, + { + "epoch": 1.128042328042328, + "grad_norm": 0.3424125515301145, + "learning_rate": 3.469227753822031e-05, + "loss": 0.5199, + "step": 1066 + }, + { + "epoch": 1.1291005291005292, + "grad_norm": 0.29631324600571024, + "learning_rate": 3.467267738141905e-05, + "loss": 0.4505, + "step": 1067 + }, + { + "epoch": 1.1301587301587301, + "grad_norm": 0.2254979652739058, + "learning_rate": 3.4653077224617795e-05, + "loss": 0.4902, + "step": 1068 + }, + { + "epoch": 1.1312169312169311, + "grad_norm": 0.29453491815047816, + "learning_rate": 3.4633477067816544e-05, + "loss": 0.4537, + "step": 1069 + }, + { + "epoch": 1.1322751322751323, + "grad_norm": 0.24120272460563255, + "learning_rate": 3.4613876911015294e-05, + "loss": 0.4156, + "step": 1070 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 0.2417667303754181, + "learning_rate": 3.4594276754214036e-05, + "loss": 0.464, + "step": 1071 + }, + { + "epoch": 1.1343915343915345, + "grad_norm": 0.24479296740003978, + "learning_rate": 3.457467659741278e-05, + "loss": 0.425, + "step": 1072 + }, + { + "epoch": 1.1354497354497355, + "grad_norm": 0.2606469824823141, + "learning_rate": 3.455507644061153e-05, + "loss": 0.4171, + "step": 1073 + }, + { + "epoch": 1.1365079365079365, + "grad_norm": 1.160342854664124, + "learning_rate": 3.453547628381027e-05, + "loss": 0.4907, + "step": 1074 + }, + { + "epoch": 1.1375661375661377, + "grad_norm": 0.27475204211746457, + "learning_rate": 3.451587612700902e-05, + "loss": 0.4412, + "step": 1075 + }, + { + "epoch": 1.1386243386243386, + "grad_norm": 0.27882185123707176, + "learning_rate": 3.449627597020776e-05, + "loss": 0.4304, + "step": 1076 + }, + { + "epoch": 1.1396825396825396, + "grad_norm": 0.24769703151282585, + "learning_rate": 3.447667581340651e-05, + "loss": 0.4258, + "step": 1077 + }, + { + "epoch": 1.1407407407407408, + "grad_norm": 0.2624061999879434, + "learning_rate": 3.4457075656605254e-05, + "loss": 0.4202, + "step": 1078 + }, + { + "epoch": 1.1417989417989418, + "grad_norm": 0.23516866564878172, + "learning_rate": 3.4437475499804e-05, + "loss": 0.4201, + "step": 1079 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.2591076428810598, + "learning_rate": 3.4417875343002746e-05, + "loss": 0.4179, + "step": 1080 + }, + { + "epoch": 1.143915343915344, + "grad_norm": 0.245392393657956, + "learning_rate": 3.439827518620149e-05, + "loss": 0.3815, + "step": 1081 + }, + { + "epoch": 1.144973544973545, + "grad_norm": 0.23935170354064206, + "learning_rate": 3.437867502940024e-05, + "loss": 0.4652, + "step": 1082 + }, + { + "epoch": 1.146031746031746, + "grad_norm": 0.24157446804740304, + "learning_rate": 3.435907487259899e-05, + "loss": 0.4123, + "step": 1083 + }, + { + "epoch": 1.1470899470899472, + "grad_norm": 0.2997924271290018, + "learning_rate": 3.433947471579773e-05, + "loss": 0.4711, + "step": 1084 + }, + { + "epoch": 1.1481481481481481, + "grad_norm": 0.2513216731368017, + "learning_rate": 3.431987455899647e-05, + "loss": 0.4065, + "step": 1085 + }, + { + "epoch": 1.1492063492063491, + "grad_norm": 0.2522352354653566, + "learning_rate": 3.430027440219522e-05, + "loss": 0.4234, + "step": 1086 + }, + { + "epoch": 1.1502645502645503, + "grad_norm": 0.29099731437781645, + "learning_rate": 3.4280674245393963e-05, + "loss": 0.4617, + "step": 1087 + }, + { + "epoch": 1.1513227513227513, + "grad_norm": 0.2940806275764955, + "learning_rate": 3.426107408859271e-05, + "loss": 0.4231, + "step": 1088 + }, + { + "epoch": 1.1523809523809523, + "grad_norm": 0.27282227795834785, + "learning_rate": 3.4241473931791455e-05, + "loss": 0.4884, + "step": 1089 + }, + { + "epoch": 1.1534391534391535, + "grad_norm": 0.25357226775494324, + "learning_rate": 3.4221873774990204e-05, + "loss": 0.4817, + "step": 1090 + }, + { + "epoch": 1.1544973544973545, + "grad_norm": 0.29654179010898835, + "learning_rate": 3.420227361818895e-05, + "loss": 0.4057, + "step": 1091 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 0.27272067824064034, + "learning_rate": 3.4182673461387696e-05, + "loss": 0.4649, + "step": 1092 + }, + { + "epoch": 1.1566137566137566, + "grad_norm": 0.25231760876479825, + "learning_rate": 3.416307330458644e-05, + "loss": 0.4467, + "step": 1093 + }, + { + "epoch": 1.1576719576719576, + "grad_norm": 1.4439868836946126, + "learning_rate": 3.414347314778518e-05, + "loss": 0.5031, + "step": 1094 + }, + { + "epoch": 1.1587301587301586, + "grad_norm": 0.2669246549318654, + "learning_rate": 3.412387299098393e-05, + "loss": 0.4242, + "step": 1095 + }, + { + "epoch": 1.1597883597883598, + "grad_norm": 0.4725447349327677, + "learning_rate": 3.410427283418268e-05, + "loss": 0.4037, + "step": 1096 + }, + { + "epoch": 1.1608465608465608, + "grad_norm": 0.24176894320740916, + "learning_rate": 3.4084672677381415e-05, + "loss": 0.4461, + "step": 1097 + }, + { + "epoch": 1.161904761904762, + "grad_norm": 0.2571630439924496, + "learning_rate": 3.4065072520580165e-05, + "loss": 0.4435, + "step": 1098 + }, + { + "epoch": 1.162962962962963, + "grad_norm": 0.25405311821960447, + "learning_rate": 3.4045472363778914e-05, + "loss": 0.4008, + "step": 1099 + }, + { + "epoch": 1.164021164021164, + "grad_norm": 0.2360510636639726, + "learning_rate": 3.4025872206977656e-05, + "loss": 0.4418, + "step": 1100 + }, + { + "epoch": 1.1650793650793652, + "grad_norm": 0.24349573097824267, + "learning_rate": 3.40062720501764e-05, + "loss": 0.4242, + "step": 1101 + }, + { + "epoch": 1.1661375661375661, + "grad_norm": 0.22205185004409997, + "learning_rate": 3.398667189337515e-05, + "loss": 0.4319, + "step": 1102 + }, + { + "epoch": 1.1671957671957671, + "grad_norm": 0.25476677264568914, + "learning_rate": 3.39670717365739e-05, + "loss": 0.4214, + "step": 1103 + }, + { + "epoch": 1.1682539682539683, + "grad_norm": 0.24140105278129537, + "learning_rate": 3.394747157977264e-05, + "loss": 0.4331, + "step": 1104 + }, + { + "epoch": 1.1693121693121693, + "grad_norm": 0.23526559056529114, + "learning_rate": 3.392787142297138e-05, + "loss": 0.4854, + "step": 1105 + }, + { + "epoch": 1.1703703703703703, + "grad_norm": 3.2639568571243096, + "learning_rate": 3.390827126617013e-05, + "loss": 0.444, + "step": 1106 + }, + { + "epoch": 1.1714285714285715, + "grad_norm": 0.29307877776027735, + "learning_rate": 3.3888671109368874e-05, + "loss": 0.491, + "step": 1107 + }, + { + "epoch": 1.1724867724867725, + "grad_norm": 0.25173250808327663, + "learning_rate": 3.3869070952567623e-05, + "loss": 0.4304, + "step": 1108 + }, + { + "epoch": 1.1735449735449737, + "grad_norm": 0.23834729259967638, + "learning_rate": 3.3849470795766366e-05, + "loss": 0.4242, + "step": 1109 + }, + { + "epoch": 1.1746031746031746, + "grad_norm": 0.2820981143799431, + "learning_rate": 3.382987063896511e-05, + "loss": 0.4531, + "step": 1110 + }, + { + "epoch": 1.1756613756613756, + "grad_norm": 0.2684834031217499, + "learning_rate": 3.381027048216386e-05, + "loss": 0.4423, + "step": 1111 + }, + { + "epoch": 1.1767195767195768, + "grad_norm": 0.22060471509608026, + "learning_rate": 3.379067032536261e-05, + "loss": 0.4567, + "step": 1112 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 0.2938768585311419, + "learning_rate": 3.3771070168561356e-05, + "loss": 0.5653, + "step": 1113 + }, + { + "epoch": 1.1788359788359788, + "grad_norm": 0.2185303188886883, + "learning_rate": 3.375147001176009e-05, + "loss": 0.4376, + "step": 1114 + }, + { + "epoch": 1.17989417989418, + "grad_norm": 0.31115860640653836, + "learning_rate": 3.373186985495884e-05, + "loss": 0.477, + "step": 1115 + }, + { + "epoch": 1.180952380952381, + "grad_norm": 0.2546303170293118, + "learning_rate": 3.371226969815759e-05, + "loss": 0.4421, + "step": 1116 + }, + { + "epoch": 1.182010582010582, + "grad_norm": 0.24401462771526597, + "learning_rate": 3.369266954135633e-05, + "loss": 0.4158, + "step": 1117 + }, + { + "epoch": 1.1830687830687832, + "grad_norm": 0.2371965772098529, + "learning_rate": 3.3673069384555075e-05, + "loss": 0.4148, + "step": 1118 + }, + { + "epoch": 1.1841269841269841, + "grad_norm": 0.25311128441059516, + "learning_rate": 3.3653469227753825e-05, + "loss": 0.3996, + "step": 1119 + }, + { + "epoch": 1.1851851851851851, + "grad_norm": 0.25406846795302634, + "learning_rate": 3.363386907095257e-05, + "loss": 0.4802, + "step": 1120 + }, + { + "epoch": 1.1862433862433863, + "grad_norm": 0.28969576250269813, + "learning_rate": 3.3614268914151316e-05, + "loss": 0.4434, + "step": 1121 + }, + { + "epoch": 1.1873015873015873, + "grad_norm": 0.25113888422786323, + "learning_rate": 3.359466875735006e-05, + "loss": 0.4693, + "step": 1122 + }, + { + "epoch": 1.1883597883597883, + "grad_norm": 0.26242305511278513, + "learning_rate": 3.35750686005488e-05, + "loss": 0.4408, + "step": 1123 + }, + { + "epoch": 1.1894179894179895, + "grad_norm": 0.24851482302114736, + "learning_rate": 3.355546844374755e-05, + "loss": 0.4855, + "step": 1124 + }, + { + "epoch": 1.1904761904761905, + "grad_norm": 0.21264747963193062, + "learning_rate": 3.35358682869463e-05, + "loss": 0.403, + "step": 1125 + }, + { + "epoch": 1.1915343915343914, + "grad_norm": 0.22264008089350337, + "learning_rate": 3.351626813014504e-05, + "loss": 0.4466, + "step": 1126 + }, + { + "epoch": 1.1925925925925926, + "grad_norm": 0.27737325502984783, + "learning_rate": 3.3496667973343785e-05, + "loss": 0.4601, + "step": 1127 + }, + { + "epoch": 1.1936507936507936, + "grad_norm": 3.368075981212778, + "learning_rate": 3.3477067816542534e-05, + "loss": 0.4871, + "step": 1128 + }, + { + "epoch": 1.1947089947089946, + "grad_norm": 0.3148955129252137, + "learning_rate": 3.345746765974128e-05, + "loss": 0.4603, + "step": 1129 + }, + { + "epoch": 1.1957671957671958, + "grad_norm": 0.2533234823076518, + "learning_rate": 3.3437867502940026e-05, + "loss": 0.4263, + "step": 1130 + }, + { + "epoch": 1.1968253968253968, + "grad_norm": 0.25463206395686244, + "learning_rate": 3.341826734613877e-05, + "loss": 0.452, + "step": 1131 + }, + { + "epoch": 1.1978835978835978, + "grad_norm": 0.27879057651843764, + "learning_rate": 3.339866718933752e-05, + "loss": 0.4882, + "step": 1132 + }, + { + "epoch": 1.198941798941799, + "grad_norm": 0.26539022543940605, + "learning_rate": 3.337906703253626e-05, + "loss": 0.4372, + "step": 1133 + }, + { + "epoch": 1.2, + "grad_norm": 0.279151992029459, + "learning_rate": 3.335946687573501e-05, + "loss": 0.4505, + "step": 1134 + }, + { + "epoch": 1.201058201058201, + "grad_norm": 0.3036270580949587, + "learning_rate": 3.333986671893375e-05, + "loss": 0.4974, + "step": 1135 + }, + { + "epoch": 1.2021164021164021, + "grad_norm": 0.27528174866789473, + "learning_rate": 3.3320266562132494e-05, + "loss": 0.4994, + "step": 1136 + }, + { + "epoch": 1.2031746031746031, + "grad_norm": 0.29856950983454983, + "learning_rate": 3.3300666405331244e-05, + "loss": 0.4534, + "step": 1137 + }, + { + "epoch": 1.2042328042328043, + "grad_norm": 0.27683008135091847, + "learning_rate": 3.328106624852999e-05, + "loss": 0.4379, + "step": 1138 + }, + { + "epoch": 1.2052910052910053, + "grad_norm": 0.26301228869162596, + "learning_rate": 3.3261466091728735e-05, + "loss": 0.4369, + "step": 1139 + }, + { + "epoch": 1.2063492063492063, + "grad_norm": 0.4106472014462593, + "learning_rate": 3.324186593492748e-05, + "loss": 0.4226, + "step": 1140 + }, + { + "epoch": 1.2074074074074075, + "grad_norm": 0.2843670121313921, + "learning_rate": 3.322226577812623e-05, + "loss": 0.4895, + "step": 1141 + }, + { + "epoch": 1.2084656084656085, + "grad_norm": 0.31274125765332483, + "learning_rate": 3.3202665621324976e-05, + "loss": 0.5111, + "step": 1142 + }, + { + "epoch": 1.2095238095238094, + "grad_norm": 0.34931031172201393, + "learning_rate": 3.318306546452372e-05, + "loss": 0.4436, + "step": 1143 + }, + { + "epoch": 1.2105820105820106, + "grad_norm": 0.23718925178525013, + "learning_rate": 3.316346530772246e-05, + "loss": 0.3974, + "step": 1144 + }, + { + "epoch": 1.2116402116402116, + "grad_norm": 0.2668299284915015, + "learning_rate": 3.314386515092121e-05, + "loss": 0.4508, + "step": 1145 + }, + { + "epoch": 1.2126984126984126, + "grad_norm": 0.2801312984615186, + "learning_rate": 3.312426499411995e-05, + "loss": 0.4694, + "step": 1146 + }, + { + "epoch": 1.2137566137566138, + "grad_norm": 0.24312099943011134, + "learning_rate": 3.31046648373187e-05, + "loss": 0.4546, + "step": 1147 + }, + { + "epoch": 1.2148148148148148, + "grad_norm": 0.25424837283402324, + "learning_rate": 3.3085064680517445e-05, + "loss": 0.4747, + "step": 1148 + }, + { + "epoch": 1.215873015873016, + "grad_norm": 0.2674867566361891, + "learning_rate": 3.306546452371619e-05, + "loss": 0.4574, + "step": 1149 + }, + { + "epoch": 1.216931216931217, + "grad_norm": 0.2627651556600422, + "learning_rate": 3.3045864366914937e-05, + "loss": 0.4378, + "step": 1150 + }, + { + "epoch": 1.217989417989418, + "grad_norm": 0.23716263801602822, + "learning_rate": 3.3026264210113686e-05, + "loss": 0.4161, + "step": 1151 + }, + { + "epoch": 1.2190476190476192, + "grad_norm": 0.3138567642984657, + "learning_rate": 3.300666405331243e-05, + "loss": 0.4383, + "step": 1152 + }, + { + "epoch": 1.2201058201058201, + "grad_norm": 0.276967866343535, + "learning_rate": 3.298706389651117e-05, + "loss": 0.4737, + "step": 1153 + }, + { + "epoch": 1.2211640211640211, + "grad_norm": 0.2607801018967495, + "learning_rate": 3.296746373970992e-05, + "loss": 0.4962, + "step": 1154 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.24637415714449645, + "learning_rate": 3.294786358290867e-05, + "loss": 0.4259, + "step": 1155 + }, + { + "epoch": 1.2232804232804233, + "grad_norm": 0.25635788195924125, + "learning_rate": 3.292826342610741e-05, + "loss": 0.4851, + "step": 1156 + }, + { + "epoch": 1.2243386243386243, + "grad_norm": 21.708465964907735, + "learning_rate": 3.2908663269306154e-05, + "loss": 0.5324, + "step": 1157 + }, + { + "epoch": 1.2253968253968255, + "grad_norm": 0.2480738276020431, + "learning_rate": 3.2889063112504904e-05, + "loss": 0.4119, + "step": 1158 + }, + { + "epoch": 1.2264550264550265, + "grad_norm": 0.2781677513733888, + "learning_rate": 3.2869462955703646e-05, + "loss": 0.5124, + "step": 1159 + }, + { + "epoch": 1.2275132275132274, + "grad_norm": 0.20926116289505092, + "learning_rate": 3.2849862798902395e-05, + "loss": 0.3981, + "step": 1160 + }, + { + "epoch": 1.2285714285714286, + "grad_norm": 0.23499136854110111, + "learning_rate": 3.283026264210114e-05, + "loss": 0.4563, + "step": 1161 + }, + { + "epoch": 1.2296296296296296, + "grad_norm": 0.28418800988255766, + "learning_rate": 3.281066248529989e-05, + "loss": 0.4683, + "step": 1162 + }, + { + "epoch": 1.2306878306878306, + "grad_norm": 0.5732237050386909, + "learning_rate": 3.279106232849863e-05, + "loss": 0.454, + "step": 1163 + }, + { + "epoch": 1.2317460317460318, + "grad_norm": 0.29349759306686635, + "learning_rate": 3.277146217169738e-05, + "loss": 0.3962, + "step": 1164 + }, + { + "epoch": 1.2328042328042328, + "grad_norm": 0.3073242580524234, + "learning_rate": 3.275186201489612e-05, + "loss": 0.4743, + "step": 1165 + }, + { + "epoch": 1.2338624338624338, + "grad_norm": 0.2695714862968926, + "learning_rate": 3.2732261858094864e-05, + "loss": 0.4684, + "step": 1166 + }, + { + "epoch": 1.234920634920635, + "grad_norm": 0.24526233061405303, + "learning_rate": 3.271266170129361e-05, + "loss": 0.4098, + "step": 1167 + }, + { + "epoch": 1.235978835978836, + "grad_norm": 0.282480189507544, + "learning_rate": 3.269306154449236e-05, + "loss": 0.4649, + "step": 1168 + }, + { + "epoch": 1.237037037037037, + "grad_norm": 0.28035139011097865, + "learning_rate": 3.26734613876911e-05, + "loss": 0.5027, + "step": 1169 + }, + { + "epoch": 1.2380952380952381, + "grad_norm": 0.2291595625165518, + "learning_rate": 3.265386123088985e-05, + "loss": 0.3896, + "step": 1170 + }, + { + "epoch": 1.2391534391534391, + "grad_norm": 0.2566644949612776, + "learning_rate": 3.2634261074088597e-05, + "loss": 0.4192, + "step": 1171 + }, + { + "epoch": 1.24021164021164, + "grad_norm": 0.24470481325359464, + "learning_rate": 3.261466091728734e-05, + "loss": 0.4963, + "step": 1172 + }, + { + "epoch": 1.2412698412698413, + "grad_norm": 0.24563820953102872, + "learning_rate": 3.259506076048608e-05, + "loss": 0.4184, + "step": 1173 + }, + { + "epoch": 1.2423280423280423, + "grad_norm": 0.2347354332412375, + "learning_rate": 3.257546060368483e-05, + "loss": 0.4514, + "step": 1174 + }, + { + "epoch": 1.2433862433862433, + "grad_norm": 2.958020037172032, + "learning_rate": 3.255586044688358e-05, + "loss": 0.5267, + "step": 1175 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 0.2843310735518099, + "learning_rate": 3.253626029008232e-05, + "loss": 0.4906, + "step": 1176 + }, + { + "epoch": 1.2455026455026454, + "grad_norm": 0.2542336692718839, + "learning_rate": 3.2516660133281065e-05, + "loss": 0.4443, + "step": 1177 + }, + { + "epoch": 1.2465608465608466, + "grad_norm": 0.22674151401535145, + "learning_rate": 3.2497059976479814e-05, + "loss": 0.4132, + "step": 1178 + }, + { + "epoch": 1.2476190476190476, + "grad_norm": 0.24724783616086354, + "learning_rate": 3.247745981967856e-05, + "loss": 0.4808, + "step": 1179 + }, + { + "epoch": 1.2486772486772486, + "grad_norm": 0.24184869409515652, + "learning_rate": 3.2457859662877306e-05, + "loss": 0.4494, + "step": 1180 + }, + { + "epoch": 1.2497354497354498, + "grad_norm": 0.23220783704881004, + "learning_rate": 3.243825950607605e-05, + "loss": 0.4236, + "step": 1181 + }, + { + "epoch": 1.2507936507936508, + "grad_norm": 0.21779940633439762, + "learning_rate": 3.241865934927479e-05, + "loss": 0.3891, + "step": 1182 + }, + { + "epoch": 1.2518518518518518, + "grad_norm": 0.22367557629440174, + "learning_rate": 3.239905919247354e-05, + "loss": 0.4615, + "step": 1183 + }, + { + "epoch": 1.252910052910053, + "grad_norm": 0.24819272296895625, + "learning_rate": 3.237945903567229e-05, + "loss": 0.4737, + "step": 1184 + }, + { + "epoch": 1.253968253968254, + "grad_norm": 0.21940462041140096, + "learning_rate": 3.235985887887103e-05, + "loss": 0.4358, + "step": 1185 + }, + { + "epoch": 1.2550264550264552, + "grad_norm": 0.23333157969435228, + "learning_rate": 3.2340258722069775e-05, + "loss": 0.4305, + "step": 1186 + }, + { + "epoch": 1.2560846560846561, + "grad_norm": 0.2878829627468074, + "learning_rate": 3.2320658565268524e-05, + "loss": 0.429, + "step": 1187 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 0.24749463547096354, + "learning_rate": 3.230105840846727e-05, + "loss": 0.4438, + "step": 1188 + }, + { + "epoch": 1.2582010582010583, + "grad_norm": 0.228540686736987, + "learning_rate": 3.2281458251666016e-05, + "loss": 0.4468, + "step": 1189 + }, + { + "epoch": 1.2592592592592593, + "grad_norm": 0.2531138794775126, + "learning_rate": 3.226185809486476e-05, + "loss": 0.4759, + "step": 1190 + }, + { + "epoch": 1.2603174603174603, + "grad_norm": 0.24125935658343106, + "learning_rate": 3.224225793806351e-05, + "loss": 0.4406, + "step": 1191 + }, + { + "epoch": 1.2613756613756615, + "grad_norm": 0.25935447680403384, + "learning_rate": 3.222265778126225e-05, + "loss": 0.4795, + "step": 1192 + }, + { + "epoch": 1.2624338624338625, + "grad_norm": 0.23939043891539874, + "learning_rate": 3.2203057624461e-05, + "loss": 0.4322, + "step": 1193 + }, + { + "epoch": 1.2634920634920634, + "grad_norm": 0.23783030731256977, + "learning_rate": 3.218345746765974e-05, + "loss": 0.4159, + "step": 1194 + }, + { + "epoch": 1.2645502645502646, + "grad_norm": 0.2801793027048429, + "learning_rate": 3.2163857310858484e-05, + "loss": 0.5096, + "step": 1195 + }, + { + "epoch": 1.2656084656084656, + "grad_norm": 0.25672962049215936, + "learning_rate": 3.214425715405723e-05, + "loss": 0.4475, + "step": 1196 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 2.98727547896751, + "learning_rate": 3.212465699725598e-05, + "loss": 0.5605, + "step": 1197 + }, + { + "epoch": 1.2677248677248678, + "grad_norm": 0.31486994948060426, + "learning_rate": 3.2105056840454725e-05, + "loss": 0.5239, + "step": 1198 + }, + { + "epoch": 1.2687830687830688, + "grad_norm": 0.2651388574204949, + "learning_rate": 3.208545668365347e-05, + "loss": 0.4854, + "step": 1199 + }, + { + "epoch": 1.2698412698412698, + "grad_norm": 0.23407941410701735, + "learning_rate": 3.206585652685222e-05, + "loss": 0.4403, + "step": 1200 + }, + { + "epoch": 1.270899470899471, + "grad_norm": 0.25844549625625296, + "learning_rate": 3.2046256370050966e-05, + "loss": 0.4974, + "step": 1201 + }, + { + "epoch": 1.271957671957672, + "grad_norm": 0.357313584767075, + "learning_rate": 3.202665621324971e-05, + "loss": 0.48, + "step": 1202 + }, + { + "epoch": 1.273015873015873, + "grad_norm": 0.2498660944411242, + "learning_rate": 3.200705605644845e-05, + "loss": 0.4524, + "step": 1203 + }, + { + "epoch": 1.2740740740740741, + "grad_norm": 0.2296750952608947, + "learning_rate": 3.19874558996472e-05, + "loss": 0.4473, + "step": 1204 + }, + { + "epoch": 1.2751322751322751, + "grad_norm": 0.23977764063922036, + "learning_rate": 3.196785574284594e-05, + "loss": 0.4306, + "step": 1205 + }, + { + "epoch": 1.276190476190476, + "grad_norm": 0.25984180898265835, + "learning_rate": 3.194825558604469e-05, + "loss": 0.4463, + "step": 1206 + }, + { + "epoch": 1.2772486772486773, + "grad_norm": 0.2205818526717538, + "learning_rate": 3.1928655429243434e-05, + "loss": 0.4162, + "step": 1207 + }, + { + "epoch": 1.2783068783068783, + "grad_norm": 0.20685381181973775, + "learning_rate": 3.190905527244218e-05, + "loss": 0.4067, + "step": 1208 + }, + { + "epoch": 1.2793650793650793, + "grad_norm": 0.2622315576719718, + "learning_rate": 3.1889455115640926e-05, + "loss": 0.4535, + "step": 1209 + }, + { + "epoch": 1.2804232804232805, + "grad_norm": 0.27562631291061185, + "learning_rate": 3.1869854958839675e-05, + "loss": 0.5411, + "step": 1210 + }, + { + "epoch": 1.2814814814814814, + "grad_norm": 0.2436787808592762, + "learning_rate": 3.185025480203842e-05, + "loss": 0.4231, + "step": 1211 + }, + { + "epoch": 1.2825396825396824, + "grad_norm": 0.26452961096327726, + "learning_rate": 3.183065464523716e-05, + "loss": 0.4682, + "step": 1212 + }, + { + "epoch": 1.2835978835978836, + "grad_norm": 0.2561065853683004, + "learning_rate": 3.181105448843591e-05, + "loss": 0.4672, + "step": 1213 + }, + { + "epoch": 1.2846560846560846, + "grad_norm": 0.23228316782055128, + "learning_rate": 3.179145433163466e-05, + "loss": 0.4774, + "step": 1214 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 2.661626366284025, + "learning_rate": 3.17718541748334e-05, + "loss": 0.492, + "step": 1215 + }, + { + "epoch": 1.2867724867724868, + "grad_norm": 0.2550139339573797, + "learning_rate": 3.1752254018032144e-05, + "loss": 0.393, + "step": 1216 + }, + { + "epoch": 1.2878306878306878, + "grad_norm": 0.2982626987372917, + "learning_rate": 3.173265386123089e-05, + "loss": 0.4337, + "step": 1217 + }, + { + "epoch": 1.2888888888888888, + "grad_norm": 0.28057911093701576, + "learning_rate": 3.1713053704429636e-05, + "loss": 0.4806, + "step": 1218 + }, + { + "epoch": 1.28994708994709, + "grad_norm": 0.4561185589253618, + "learning_rate": 3.1693453547628385e-05, + "loss": 0.4365, + "step": 1219 + }, + { + "epoch": 1.291005291005291, + "grad_norm": 0.29263696706005893, + "learning_rate": 3.167385339082713e-05, + "loss": 0.4656, + "step": 1220 + }, + { + "epoch": 1.2920634920634921, + "grad_norm": 0.27093691099606854, + "learning_rate": 3.165425323402587e-05, + "loss": 0.4962, + "step": 1221 + }, + { + "epoch": 1.2931216931216931, + "grad_norm": 0.28064868068280735, + "learning_rate": 3.163465307722462e-05, + "loss": 0.4068, + "step": 1222 + }, + { + "epoch": 1.294179894179894, + "grad_norm": 0.3148596581856991, + "learning_rate": 3.161505292042337e-05, + "loss": 0.4815, + "step": 1223 + }, + { + "epoch": 1.2952380952380953, + "grad_norm": 0.7434992980537427, + "learning_rate": 3.159545276362211e-05, + "loss": 0.5237, + "step": 1224 + }, + { + "epoch": 1.2962962962962963, + "grad_norm": 0.27981974942403515, + "learning_rate": 3.1575852606820853e-05, + "loss": 0.4476, + "step": 1225 + }, + { + "epoch": 1.2973544973544975, + "grad_norm": 0.30917000871173816, + "learning_rate": 3.15562524500196e-05, + "loss": 0.433, + "step": 1226 + }, + { + "epoch": 1.2984126984126985, + "grad_norm": 0.2756768577076242, + "learning_rate": 3.153665229321835e-05, + "loss": 0.4463, + "step": 1227 + }, + { + "epoch": 1.2994708994708994, + "grad_norm": 0.29298489584722304, + "learning_rate": 3.1517052136417094e-05, + "loss": 0.4787, + "step": 1228 + }, + { + "epoch": 1.3005291005291006, + "grad_norm": 0.23814302832789452, + "learning_rate": 3.149745197961584e-05, + "loss": 0.4115, + "step": 1229 + }, + { + "epoch": 1.3015873015873016, + "grad_norm": 0.3031186340550912, + "learning_rate": 3.1477851822814586e-05, + "loss": 0.4138, + "step": 1230 + }, + { + "epoch": 1.3026455026455026, + "grad_norm": 0.28701409757098467, + "learning_rate": 3.145825166601333e-05, + "loss": 0.4445, + "step": 1231 + }, + { + "epoch": 1.3037037037037038, + "grad_norm": 0.24322652680335896, + "learning_rate": 3.143865150921208e-05, + "loss": 0.4955, + "step": 1232 + }, + { + "epoch": 1.3047619047619048, + "grad_norm": 0.2672959117899414, + "learning_rate": 3.141905135241082e-05, + "loss": 0.495, + "step": 1233 + }, + { + "epoch": 1.3058201058201058, + "grad_norm": 0.25188935994394157, + "learning_rate": 3.139945119560956e-05, + "loss": 0.4609, + "step": 1234 + }, + { + "epoch": 1.306878306878307, + "grad_norm": 0.24955613360702394, + "learning_rate": 3.137985103880831e-05, + "loss": 0.4466, + "step": 1235 + }, + { + "epoch": 1.307936507936508, + "grad_norm": 0.23518682524284248, + "learning_rate": 3.136025088200706e-05, + "loss": 0.3943, + "step": 1236 + }, + { + "epoch": 1.308994708994709, + "grad_norm": 0.23720969598849823, + "learning_rate": 3.1340650725205804e-05, + "loss": 0.5164, + "step": 1237 + }, + { + "epoch": 1.3100529100529101, + "grad_norm": 0.21537376471764513, + "learning_rate": 3.1321050568404546e-05, + "loss": 0.4002, + "step": 1238 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 1.6860157051017535, + "learning_rate": 3.1301450411603296e-05, + "loss": 0.4859, + "step": 1239 + }, + { + "epoch": 1.312169312169312, + "grad_norm": 0.2651741235611922, + "learning_rate": 3.1281850254802045e-05, + "loss": 0.421, + "step": 1240 + }, + { + "epoch": 1.3132275132275133, + "grad_norm": 0.23836910046795176, + "learning_rate": 3.126225009800078e-05, + "loss": 0.4484, + "step": 1241 + }, + { + "epoch": 1.3142857142857143, + "grad_norm": 0.34511017850329395, + "learning_rate": 3.124264994119953e-05, + "loss": 0.4587, + "step": 1242 + }, + { + "epoch": 1.3153439153439153, + "grad_norm": 0.3025522362899887, + "learning_rate": 3.122304978439828e-05, + "loss": 0.4275, + "step": 1243 + }, + { + "epoch": 1.3164021164021165, + "grad_norm": 0.247152508905285, + "learning_rate": 3.120344962759702e-05, + "loss": 0.4214, + "step": 1244 + }, + { + "epoch": 1.3174603174603174, + "grad_norm": 0.25398227298952686, + "learning_rate": 3.1183849470795764e-05, + "loss": 0.4581, + "step": 1245 + }, + { + "epoch": 1.3185185185185184, + "grad_norm": 0.22633308872087934, + "learning_rate": 3.1164249313994513e-05, + "loss": 0.4584, + "step": 1246 + }, + { + "epoch": 1.3195767195767196, + "grad_norm": 0.24210217710444076, + "learning_rate": 3.1144649157193256e-05, + "loss": 0.4206, + "step": 1247 + }, + { + "epoch": 1.3206349206349206, + "grad_norm": 0.23335804933192775, + "learning_rate": 3.1125049000392005e-05, + "loss": 0.4248, + "step": 1248 + }, + { + "epoch": 1.3216931216931216, + "grad_norm": 0.24164931346146493, + "learning_rate": 3.110544884359075e-05, + "loss": 0.4013, + "step": 1249 + }, + { + "epoch": 1.3227513227513228, + "grad_norm": 1.2954500627805814, + "learning_rate": 3.10858486867895e-05, + "loss": 0.5315, + "step": 1250 + }, + { + "epoch": 1.3238095238095238, + "grad_norm": 0.23445766500023593, + "learning_rate": 3.106624852998824e-05, + "loss": 0.4013, + "step": 1251 + }, + { + "epoch": 1.3248677248677247, + "grad_norm": 0.24783454283131148, + "learning_rate": 3.104664837318699e-05, + "loss": 0.4648, + "step": 1252 + }, + { + "epoch": 1.325925925925926, + "grad_norm": 0.2659873598285454, + "learning_rate": 3.102704821638574e-05, + "loss": 0.4992, + "step": 1253 + }, + { + "epoch": 1.326984126984127, + "grad_norm": 0.6888495280810113, + "learning_rate": 3.1007448059584474e-05, + "loss": 0.4971, + "step": 1254 + }, + { + "epoch": 1.328042328042328, + "grad_norm": 0.24722561577784258, + "learning_rate": 3.098784790278322e-05, + "loss": 0.4246, + "step": 1255 + }, + { + "epoch": 1.3291005291005291, + "grad_norm": 0.27176108829651746, + "learning_rate": 3.096824774598197e-05, + "loss": 0.493, + "step": 1256 + }, + { + "epoch": 1.33015873015873, + "grad_norm": 0.2922219870244475, + "learning_rate": 3.0948647589180715e-05, + "loss": 0.4643, + "step": 1257 + }, + { + "epoch": 1.3312169312169313, + "grad_norm": 0.2495354668993547, + "learning_rate": 3.092904743237946e-05, + "loss": 0.4807, + "step": 1258 + }, + { + "epoch": 1.3322751322751323, + "grad_norm": 0.25416175375930267, + "learning_rate": 3.0909447275578206e-05, + "loss": 0.4559, + "step": 1259 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.2505309654320362, + "learning_rate": 3.088984711877695e-05, + "loss": 0.4175, + "step": 1260 + }, + { + "epoch": 1.3343915343915345, + "grad_norm": 0.22168988059391945, + "learning_rate": 3.08702469619757e-05, + "loss": 0.3638, + "step": 1261 + }, + { + "epoch": 1.3354497354497354, + "grad_norm": 0.2088949121467012, + "learning_rate": 3.085064680517444e-05, + "loss": 0.3575, + "step": 1262 + }, + { + "epoch": 1.3365079365079366, + "grad_norm": 0.26660498435295343, + "learning_rate": 3.083104664837319e-05, + "loss": 0.4729, + "step": 1263 + }, + { + "epoch": 1.3375661375661376, + "grad_norm": 0.24658883170058563, + "learning_rate": 3.081144649157193e-05, + "loss": 0.4545, + "step": 1264 + }, + { + "epoch": 1.3386243386243386, + "grad_norm": 0.2236745034029776, + "learning_rate": 3.079184633477068e-05, + "loss": 0.4503, + "step": 1265 + }, + { + "epoch": 1.3396825396825398, + "grad_norm": 0.24135305732779005, + "learning_rate": 3.0772246177969424e-05, + "loss": 0.4874, + "step": 1266 + }, + { + "epoch": 1.3407407407407408, + "grad_norm": 0.3256043618189639, + "learning_rate": 3.075264602116817e-05, + "loss": 0.5231, + "step": 1267 + }, + { + "epoch": 1.3417989417989418, + "grad_norm": 0.23737346236972562, + "learning_rate": 3.0733045864366916e-05, + "loss": 0.4316, + "step": 1268 + }, + { + "epoch": 1.342857142857143, + "grad_norm": 0.23544854426751335, + "learning_rate": 3.0713445707565665e-05, + "loss": 0.5169, + "step": 1269 + }, + { + "epoch": 1.343915343915344, + "grad_norm": 1.659064568731437, + "learning_rate": 3.069384555076441e-05, + "loss": 0.4348, + "step": 1270 + }, + { + "epoch": 1.344973544973545, + "grad_norm": 0.2534937144399376, + "learning_rate": 3.067424539396315e-05, + "loss": 0.4652, + "step": 1271 + }, + { + "epoch": 1.3460317460317461, + "grad_norm": 0.23526593221138056, + "learning_rate": 3.06546452371619e-05, + "loss": 0.4616, + "step": 1272 + }, + { + "epoch": 1.3470899470899471, + "grad_norm": 0.23742609091689737, + "learning_rate": 3.063504508036065e-05, + "loss": 0.4369, + "step": 1273 + }, + { + "epoch": 1.348148148148148, + "grad_norm": 0.2222447404589058, + "learning_rate": 3.061544492355939e-05, + "loss": 0.4124, + "step": 1274 + }, + { + "epoch": 1.3492063492063493, + "grad_norm": 0.29184250980945264, + "learning_rate": 3.0595844766758134e-05, + "loss": 0.4454, + "step": 1275 + }, + { + "epoch": 1.3502645502645503, + "grad_norm": 0.2349625400452902, + "learning_rate": 3.057624460995688e-05, + "loss": 0.4706, + "step": 1276 + }, + { + "epoch": 1.3513227513227513, + "grad_norm": 0.21431852615283079, + "learning_rate": 3.0556644453155625e-05, + "loss": 0.4258, + "step": 1277 + }, + { + "epoch": 1.3523809523809525, + "grad_norm": 0.21372092688801175, + "learning_rate": 3.0537044296354375e-05, + "loss": 0.4199, + "step": 1278 + }, + { + "epoch": 1.3534391534391534, + "grad_norm": 0.2579456014466511, + "learning_rate": 3.051744413955312e-05, + "loss": 0.4978, + "step": 1279 + }, + { + "epoch": 1.3544973544973544, + "grad_norm": 0.2507941348475173, + "learning_rate": 3.0497843982751863e-05, + "loss": 0.4665, + "step": 1280 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 0.232129817590424, + "learning_rate": 3.047824382595061e-05, + "loss": 0.456, + "step": 1281 + }, + { + "epoch": 1.3566137566137566, + "grad_norm": 0.2514526079081348, + "learning_rate": 3.0458643669149355e-05, + "loss": 0.4404, + "step": 1282 + }, + { + "epoch": 1.3576719576719576, + "grad_norm": 0.2572184939136445, + "learning_rate": 3.0439043512348097e-05, + "loss": 0.5318, + "step": 1283 + }, + { + "epoch": 1.3587301587301588, + "grad_norm": 0.4039307927131717, + "learning_rate": 3.0419443355546846e-05, + "loss": 0.3901, + "step": 1284 + }, + { + "epoch": 1.3597883597883598, + "grad_norm": 0.24715025836977536, + "learning_rate": 3.0399843198745592e-05, + "loss": 0.4698, + "step": 1285 + }, + { + "epoch": 1.3608465608465607, + "grad_norm": 0.23085610616983096, + "learning_rate": 3.0380243041944338e-05, + "loss": 0.4939, + "step": 1286 + }, + { + "epoch": 1.361904761904762, + "grad_norm": 0.2364958065579408, + "learning_rate": 3.036064288514308e-05, + "loss": 0.4244, + "step": 1287 + }, + { + "epoch": 1.362962962962963, + "grad_norm": 0.25662478506333875, + "learning_rate": 3.034104272834183e-05, + "loss": 0.4692, + "step": 1288 + }, + { + "epoch": 1.364021164021164, + "grad_norm": 0.23895684567679154, + "learning_rate": 3.0321442571540576e-05, + "loss": 0.4296, + "step": 1289 + }, + { + "epoch": 1.3650793650793651, + "grad_norm": 0.2440970628813272, + "learning_rate": 3.030184241473932e-05, + "loss": 0.4288, + "step": 1290 + }, + { + "epoch": 1.366137566137566, + "grad_norm": 0.22375993243823888, + "learning_rate": 3.0282242257938064e-05, + "loss": 0.4409, + "step": 1291 + }, + { + "epoch": 1.367195767195767, + "grad_norm": 0.2213878463198946, + "learning_rate": 3.0262642101136813e-05, + "loss": 0.4122, + "step": 1292 + }, + { + "epoch": 1.3682539682539683, + "grad_norm": 0.23752017382355897, + "learning_rate": 3.0243041944335553e-05, + "loss": 0.4666, + "step": 1293 + }, + { + "epoch": 1.3693121693121693, + "grad_norm": 0.25742210373427815, + "learning_rate": 3.0223441787534302e-05, + "loss": 0.4956, + "step": 1294 + }, + { + "epoch": 1.3703703703703702, + "grad_norm": 0.234897224069936, + "learning_rate": 3.0203841630733048e-05, + "loss": 0.4467, + "step": 1295 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 0.23263694871443225, + "learning_rate": 3.018424147393179e-05, + "loss": 0.4795, + "step": 1296 + }, + { + "epoch": 1.3724867724867724, + "grad_norm": 0.2712687227877605, + "learning_rate": 3.0164641317130536e-05, + "loss": 0.5024, + "step": 1297 + }, + { + "epoch": 1.3735449735449736, + "grad_norm": 0.22372235767515633, + "learning_rate": 3.0145041160329285e-05, + "loss": 0.48, + "step": 1298 + }, + { + "epoch": 1.3746031746031746, + "grad_norm": 0.2406938475308407, + "learning_rate": 3.012544100352803e-05, + "loss": 0.4314, + "step": 1299 + }, + { + "epoch": 1.3756613756613756, + "grad_norm": 0.21389054293693965, + "learning_rate": 3.0105840846726774e-05, + "loss": 0.4229, + "step": 1300 + }, + { + "epoch": 1.3767195767195768, + "grad_norm": 0.23149026826112992, + "learning_rate": 3.008624068992552e-05, + "loss": 0.4707, + "step": 1301 + }, + { + "epoch": 1.3777777777777778, + "grad_norm": 0.2691143078356963, + "learning_rate": 3.006664053312427e-05, + "loss": 0.4282, + "step": 1302 + }, + { + "epoch": 1.378835978835979, + "grad_norm": 0.20755788867950342, + "learning_rate": 3.004704037632301e-05, + "loss": 0.3924, + "step": 1303 + }, + { + "epoch": 1.37989417989418, + "grad_norm": 0.23052600055188638, + "learning_rate": 3.0027440219521757e-05, + "loss": 0.4129, + "step": 1304 + }, + { + "epoch": 1.380952380952381, + "grad_norm": 0.22668631216324492, + "learning_rate": 3.0007840062720503e-05, + "loss": 0.4708, + "step": 1305 + }, + { + "epoch": 1.3820105820105821, + "grad_norm": 0.2660877251492311, + "learning_rate": 2.9988239905919246e-05, + "loss": 0.5037, + "step": 1306 + }, + { + "epoch": 1.3830687830687831, + "grad_norm": 0.22823512577697444, + "learning_rate": 2.9968639749117995e-05, + "loss": 0.4464, + "step": 1307 + }, + { + "epoch": 1.384126984126984, + "grad_norm": 0.24777055372668572, + "learning_rate": 2.994903959231674e-05, + "loss": 0.5314, + "step": 1308 + }, + { + "epoch": 1.3851851851851853, + "grad_norm": 0.23660719349812223, + "learning_rate": 2.9929439435515483e-05, + "loss": 0.4372, + "step": 1309 + }, + { + "epoch": 1.3862433862433863, + "grad_norm": 0.25379316726262235, + "learning_rate": 2.990983927871423e-05, + "loss": 0.4353, + "step": 1310 + }, + { + "epoch": 1.3873015873015873, + "grad_norm": 0.25004667495021365, + "learning_rate": 2.989023912191298e-05, + "loss": 0.4608, + "step": 1311 + }, + { + "epoch": 1.3883597883597885, + "grad_norm": 0.22311146023995984, + "learning_rate": 2.9870638965111724e-05, + "loss": 0.4398, + "step": 1312 + }, + { + "epoch": 1.3894179894179894, + "grad_norm": 0.2557529215124851, + "learning_rate": 2.9851038808310467e-05, + "loss": 0.5005, + "step": 1313 + }, + { + "epoch": 1.3904761904761904, + "grad_norm": 0.219279598189162, + "learning_rate": 2.9831438651509213e-05, + "loss": 0.4129, + "step": 1314 + }, + { + "epoch": 1.3915343915343916, + "grad_norm": 0.2269485021477995, + "learning_rate": 2.9811838494707962e-05, + "loss": 0.4145, + "step": 1315 + }, + { + "epoch": 1.3925925925925926, + "grad_norm": 0.2331786311014036, + "learning_rate": 2.9792238337906704e-05, + "loss": 0.42, + "step": 1316 + }, + { + "epoch": 1.3936507936507936, + "grad_norm": 0.31888841203624635, + "learning_rate": 2.977263818110545e-05, + "loss": 0.4383, + "step": 1317 + }, + { + "epoch": 1.3947089947089948, + "grad_norm": 0.212841055960245, + "learning_rate": 2.9753038024304196e-05, + "loss": 0.4061, + "step": 1318 + }, + { + "epoch": 1.3957671957671958, + "grad_norm": 0.24247705953739102, + "learning_rate": 2.973343786750294e-05, + "loss": 0.4658, + "step": 1319 + }, + { + "epoch": 1.3968253968253967, + "grad_norm": 0.30703930058231754, + "learning_rate": 2.9713837710701688e-05, + "loss": 0.3601, + "step": 1320 + }, + { + "epoch": 1.397883597883598, + "grad_norm": 0.23472988733850123, + "learning_rate": 2.9694237553900434e-05, + "loss": 0.4945, + "step": 1321 + }, + { + "epoch": 1.398941798941799, + "grad_norm": 0.30870145935773974, + "learning_rate": 2.967463739709918e-05, + "loss": 0.4775, + "step": 1322 + }, + { + "epoch": 1.4, + "grad_norm": 0.22196743152262746, + "learning_rate": 2.9655037240297922e-05, + "loss": 0.457, + "step": 1323 + }, + { + "epoch": 1.4010582010582011, + "grad_norm": 0.23880933160360443, + "learning_rate": 2.963543708349667e-05, + "loss": 0.4267, + "step": 1324 + }, + { + "epoch": 1.402116402116402, + "grad_norm": 0.20702662843283434, + "learning_rate": 2.9615836926695417e-05, + "loss": 0.441, + "step": 1325 + }, + { + "epoch": 1.403174603174603, + "grad_norm": 0.2167350518937471, + "learning_rate": 2.959623676989416e-05, + "loss": 0.4408, + "step": 1326 + }, + { + "epoch": 1.4042328042328043, + "grad_norm": 0.24089833902821953, + "learning_rate": 2.9576636613092906e-05, + "loss": 0.4968, + "step": 1327 + }, + { + "epoch": 1.4052910052910053, + "grad_norm": 0.2241238478376523, + "learning_rate": 2.9557036456291655e-05, + "loss": 0.4517, + "step": 1328 + }, + { + "epoch": 1.4063492063492062, + "grad_norm": 0.22856799607317055, + "learning_rate": 2.9537436299490394e-05, + "loss": 0.4579, + "step": 1329 + }, + { + "epoch": 1.4074074074074074, + "grad_norm": 0.2268187851367138, + "learning_rate": 2.9517836142689143e-05, + "loss": 0.4649, + "step": 1330 + }, + { + "epoch": 1.4084656084656084, + "grad_norm": 0.22233549147880394, + "learning_rate": 2.949823598588789e-05, + "loss": 0.4465, + "step": 1331 + }, + { + "epoch": 1.4095238095238094, + "grad_norm": 0.2236855488492212, + "learning_rate": 2.947863582908663e-05, + "loss": 0.4228, + "step": 1332 + }, + { + "epoch": 1.4105820105820106, + "grad_norm": 0.23801323711217828, + "learning_rate": 2.9459035672285377e-05, + "loss": 0.4252, + "step": 1333 + }, + { + "epoch": 1.4116402116402116, + "grad_norm": 0.24798530897647444, + "learning_rate": 2.9439435515484127e-05, + "loss": 0.5064, + "step": 1334 + }, + { + "epoch": 1.4126984126984126, + "grad_norm": 0.23941337695722817, + "learning_rate": 2.9419835358682873e-05, + "loss": 0.4679, + "step": 1335 + }, + { + "epoch": 1.4137566137566138, + "grad_norm": 0.2623716098058726, + "learning_rate": 2.9400235201881615e-05, + "loss": 0.4473, + "step": 1336 + }, + { + "epoch": 1.4148148148148147, + "grad_norm": 0.21199389424437273, + "learning_rate": 2.938063504508036e-05, + "loss": 0.4225, + "step": 1337 + }, + { + "epoch": 1.415873015873016, + "grad_norm": 0.21541116818374437, + "learning_rate": 2.936103488827911e-05, + "loss": 0.4108, + "step": 1338 + }, + { + "epoch": 1.416931216931217, + "grad_norm": 0.24853955951034445, + "learning_rate": 2.9341434731477853e-05, + "loss": 0.447, + "step": 1339 + }, + { + "epoch": 1.417989417989418, + "grad_norm": 0.2215945031478815, + "learning_rate": 2.93218345746766e-05, + "loss": 0.4761, + "step": 1340 + }, + { + "epoch": 1.4190476190476191, + "grad_norm": 0.2133339228199975, + "learning_rate": 2.9302234417875344e-05, + "loss": 0.4524, + "step": 1341 + }, + { + "epoch": 1.42010582010582, + "grad_norm": 0.24231327015852916, + "learning_rate": 2.9282634261074087e-05, + "loss": 0.4629, + "step": 1342 + }, + { + "epoch": 1.4211640211640213, + "grad_norm": 0.21685366161636507, + "learning_rate": 2.9263034104272836e-05, + "loss": 0.4234, + "step": 1343 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 0.22253006527800145, + "learning_rate": 2.9243433947471582e-05, + "loss": 0.4323, + "step": 1344 + }, + { + "epoch": 1.4232804232804233, + "grad_norm": 0.22176071929381708, + "learning_rate": 2.9223833790670325e-05, + "loss": 0.4427, + "step": 1345 + }, + { + "epoch": 1.4243386243386245, + "grad_norm": 0.24232602640124465, + "learning_rate": 2.920423363386907e-05, + "loss": 0.3828, + "step": 1346 + }, + { + "epoch": 1.4253968253968254, + "grad_norm": 0.22581122208104695, + "learning_rate": 2.918463347706782e-05, + "loss": 0.4805, + "step": 1347 + }, + { + "epoch": 1.4264550264550264, + "grad_norm": 0.20026012333803486, + "learning_rate": 2.9165033320266566e-05, + "loss": 0.3919, + "step": 1348 + }, + { + "epoch": 1.4275132275132276, + "grad_norm": 0.22333787876129188, + "learning_rate": 2.9145433163465308e-05, + "loss": 0.4131, + "step": 1349 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.2687484229021416, + "learning_rate": 2.9125833006664054e-05, + "loss": 0.5264, + "step": 1350 + }, + { + "epoch": 1.4296296296296296, + "grad_norm": 0.2103685914143013, + "learning_rate": 2.9106232849862803e-05, + "loss": 0.4576, + "step": 1351 + }, + { + "epoch": 1.4306878306878308, + "grad_norm": 0.21771460457762667, + "learning_rate": 2.9086632693061546e-05, + "loss": 0.4361, + "step": 1352 + }, + { + "epoch": 1.4317460317460318, + "grad_norm": 0.20303241796561644, + "learning_rate": 2.906703253626029e-05, + "loss": 0.3971, + "step": 1353 + }, + { + "epoch": 1.4328042328042327, + "grad_norm": 0.20143577073807975, + "learning_rate": 2.9047432379459037e-05, + "loss": 0.3797, + "step": 1354 + }, + { + "epoch": 1.433862433862434, + "grad_norm": 0.2329009387387878, + "learning_rate": 2.902783222265778e-05, + "loss": 0.506, + "step": 1355 + }, + { + "epoch": 1.434920634920635, + "grad_norm": 0.2702723280262594, + "learning_rate": 2.900823206585653e-05, + "loss": 0.3775, + "step": 1356 + }, + { + "epoch": 1.435978835978836, + "grad_norm": 0.2834676972961095, + "learning_rate": 2.8988631909055275e-05, + "loss": 0.462, + "step": 1357 + }, + { + "epoch": 1.4370370370370371, + "grad_norm": 0.23065496781142314, + "learning_rate": 2.8969031752254017e-05, + "loss": 0.4994, + "step": 1358 + }, + { + "epoch": 1.438095238095238, + "grad_norm": 0.22768658403107855, + "learning_rate": 2.8949431595452763e-05, + "loss": 0.4218, + "step": 1359 + }, + { + "epoch": 1.439153439153439, + "grad_norm": 0.2680457798964453, + "learning_rate": 2.8929831438651513e-05, + "loss": 0.4527, + "step": 1360 + }, + { + "epoch": 1.4402116402116403, + "grad_norm": 0.2275408510504903, + "learning_rate": 2.891023128185026e-05, + "loss": 0.4718, + "step": 1361 + }, + { + "epoch": 1.4412698412698413, + "grad_norm": 0.23860010786360963, + "learning_rate": 2.8890631125049e-05, + "loss": 0.4656, + "step": 1362 + }, + { + "epoch": 1.4423280423280422, + "grad_norm": 0.5143958846694693, + "learning_rate": 2.8871030968247747e-05, + "loss": 0.4567, + "step": 1363 + }, + { + "epoch": 1.4433862433862434, + "grad_norm": 0.23216914200826835, + "learning_rate": 2.8851430811446496e-05, + "loss": 0.4873, + "step": 1364 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.25213100827470797, + "learning_rate": 2.8831830654645235e-05, + "loss": 0.4681, + "step": 1365 + }, + { + "epoch": 1.4455026455026454, + "grad_norm": 0.22446515285108198, + "learning_rate": 2.8812230497843984e-05, + "loss": 0.3967, + "step": 1366 + }, + { + "epoch": 1.4465608465608466, + "grad_norm": 0.22388321159591337, + "learning_rate": 2.879263034104273e-05, + "loss": 0.409, + "step": 1367 + }, + { + "epoch": 1.4476190476190476, + "grad_norm": 0.22054153685138833, + "learning_rate": 2.8773030184241473e-05, + "loss": 0.4393, + "step": 1368 + }, + { + "epoch": 1.4486772486772486, + "grad_norm": 0.23961761676424165, + "learning_rate": 2.875343002744022e-05, + "loss": 0.416, + "step": 1369 + }, + { + "epoch": 1.4497354497354498, + "grad_norm": 0.24967858263135762, + "learning_rate": 2.8733829870638968e-05, + "loss": 0.4897, + "step": 1370 + }, + { + "epoch": 1.4507936507936507, + "grad_norm": 0.23131162096590446, + "learning_rate": 2.8714229713837714e-05, + "loss": 0.4876, + "step": 1371 + }, + { + "epoch": 1.4518518518518517, + "grad_norm": 0.9779701550723804, + "learning_rate": 2.8694629557036456e-05, + "loss": 0.4304, + "step": 1372 + }, + { + "epoch": 1.452910052910053, + "grad_norm": 0.25630422299181627, + "learning_rate": 2.8675029400235202e-05, + "loss": 0.4348, + "step": 1373 + }, + { + "epoch": 1.453968253968254, + "grad_norm": 0.25784690293324475, + "learning_rate": 2.865542924343395e-05, + "loss": 0.4629, + "step": 1374 + }, + { + "epoch": 1.455026455026455, + "grad_norm": 0.27761793043496447, + "learning_rate": 2.8635829086632694e-05, + "loss": 0.4485, + "step": 1375 + }, + { + "epoch": 1.456084656084656, + "grad_norm": 0.26282171909723734, + "learning_rate": 2.861622892983144e-05, + "loss": 0.4981, + "step": 1376 + }, + { + "epoch": 1.457142857142857, + "grad_norm": 0.7366784020256089, + "learning_rate": 2.8596628773030186e-05, + "loss": 0.4384, + "step": 1377 + }, + { + "epoch": 1.4582010582010583, + "grad_norm": 0.23451220676925627, + "learning_rate": 2.8577028616228928e-05, + "loss": 0.4342, + "step": 1378 + }, + { + "epoch": 1.4592592592592593, + "grad_norm": 0.2232768861934982, + "learning_rate": 2.8557428459427677e-05, + "loss": 0.4122, + "step": 1379 + }, + { + "epoch": 1.4603174603174602, + "grad_norm": 0.2330751792602911, + "learning_rate": 2.8537828302626423e-05, + "loss": 0.4716, + "step": 1380 + }, + { + "epoch": 1.4613756613756614, + "grad_norm": 0.2363839837673862, + "learning_rate": 2.8518228145825166e-05, + "loss": 0.4455, + "step": 1381 + }, + { + "epoch": 1.4624338624338624, + "grad_norm": 0.22199158535950497, + "learning_rate": 2.8498627989023912e-05, + "loss": 0.4017, + "step": 1382 + }, + { + "epoch": 1.4634920634920636, + "grad_norm": 0.2140913960403465, + "learning_rate": 2.847902783222266e-05, + "loss": 0.4171, + "step": 1383 + }, + { + "epoch": 1.4645502645502646, + "grad_norm": 0.21942455553450194, + "learning_rate": 2.8459427675421407e-05, + "loss": 0.4345, + "step": 1384 + }, + { + "epoch": 1.4656084656084656, + "grad_norm": 0.2589886529373693, + "learning_rate": 2.843982751862015e-05, + "loss": 0.4751, + "step": 1385 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.2161806830799717, + "learning_rate": 2.8420227361818895e-05, + "loss": 0.449, + "step": 1386 + }, + { + "epoch": 1.4677248677248678, + "grad_norm": 0.2155045931577355, + "learning_rate": 2.8400627205017644e-05, + "loss": 0.3958, + "step": 1387 + }, + { + "epoch": 1.4687830687830687, + "grad_norm": 34.14419147235214, + "learning_rate": 2.8381027048216387e-05, + "loss": 0.931, + "step": 1388 + }, + { + "epoch": 1.46984126984127, + "grad_norm": 0.2349647452936981, + "learning_rate": 2.8361426891415133e-05, + "loss": 0.4205, + "step": 1389 + }, + { + "epoch": 1.470899470899471, + "grad_norm": 0.4986074723963525, + "learning_rate": 2.834182673461388e-05, + "loss": 0.4416, + "step": 1390 + }, + { + "epoch": 1.471957671957672, + "grad_norm": 0.23543109220326247, + "learning_rate": 2.832222657781262e-05, + "loss": 0.4605, + "step": 1391 + }, + { + "epoch": 1.4730158730158731, + "grad_norm": 0.2041343822972172, + "learning_rate": 2.830262642101137e-05, + "loss": 0.4152, + "step": 1392 + }, + { + "epoch": 1.474074074074074, + "grad_norm": 0.22858488741114016, + "learning_rate": 2.8283026264210116e-05, + "loss": 0.4099, + "step": 1393 + }, + { + "epoch": 1.475132275132275, + "grad_norm": 0.2583668529158323, + "learning_rate": 2.826342610740886e-05, + "loss": 0.4569, + "step": 1394 + }, + { + "epoch": 1.4761904761904763, + "grad_norm": 0.2110916980288495, + "learning_rate": 2.8243825950607605e-05, + "loss": 0.4281, + "step": 1395 + }, + { + "epoch": 1.4772486772486773, + "grad_norm": 0.2335971618684065, + "learning_rate": 2.8224225793806354e-05, + "loss": 0.4363, + "step": 1396 + }, + { + "epoch": 1.4783068783068782, + "grad_norm": 0.24852243373430788, + "learning_rate": 2.82046256370051e-05, + "loss": 0.4723, + "step": 1397 + }, + { + "epoch": 1.4793650793650794, + "grad_norm": 1.7834274229889742, + "learning_rate": 2.8185025480203842e-05, + "loss": 0.4434, + "step": 1398 + }, + { + "epoch": 1.4804232804232804, + "grad_norm": 0.24286885099131655, + "learning_rate": 2.8165425323402588e-05, + "loss": 0.4564, + "step": 1399 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 0.2609544012499301, + "learning_rate": 2.8145825166601337e-05, + "loss": 0.4881, + "step": 1400 + }, + { + "epoch": 1.4825396825396826, + "grad_norm": 0.2078688963633995, + "learning_rate": 2.8126225009800077e-05, + "loss": 0.4035, + "step": 1401 + }, + { + "epoch": 1.4835978835978836, + "grad_norm": 0.2183047037176754, + "learning_rate": 2.8106624852998826e-05, + "loss": 0.4606, + "step": 1402 + }, + { + "epoch": 1.4846560846560846, + "grad_norm": 0.24125277886676127, + "learning_rate": 2.808702469619757e-05, + "loss": 0.3907, + "step": 1403 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 0.21563203906268566, + "learning_rate": 2.8067424539396314e-05, + "loss": 0.3794, + "step": 1404 + }, + { + "epoch": 1.4867724867724867, + "grad_norm": 0.23564668760181676, + "learning_rate": 2.804782438259506e-05, + "loss": 0.3883, + "step": 1405 + }, + { + "epoch": 1.4878306878306877, + "grad_norm": 0.22700381452133556, + "learning_rate": 2.802822422579381e-05, + "loss": 0.4516, + "step": 1406 + }, + { + "epoch": 1.488888888888889, + "grad_norm": 0.21734610674555102, + "learning_rate": 2.8008624068992552e-05, + "loss": 0.4437, + "step": 1407 + }, + { + "epoch": 1.48994708994709, + "grad_norm": 0.23992886346228195, + "learning_rate": 2.7989023912191298e-05, + "loss": 0.4216, + "step": 1408 + }, + { + "epoch": 1.491005291005291, + "grad_norm": 0.2628261185467104, + "learning_rate": 2.7969423755390044e-05, + "loss": 0.5153, + "step": 1409 + }, + { + "epoch": 1.492063492063492, + "grad_norm": 0.2224824662272724, + "learning_rate": 2.7949823598588793e-05, + "loss": 0.4091, + "step": 1410 + }, + { + "epoch": 1.493121693121693, + "grad_norm": 0.23493481404403857, + "learning_rate": 2.7930223441787535e-05, + "loss": 0.4866, + "step": 1411 + }, + { + "epoch": 1.494179894179894, + "grad_norm": 0.2833535564343193, + "learning_rate": 2.791062328498628e-05, + "loss": 0.5036, + "step": 1412 + }, + { + "epoch": 1.4952380952380953, + "grad_norm": 0.2436710459859125, + "learning_rate": 2.7891023128185027e-05, + "loss": 0.4242, + "step": 1413 + }, + { + "epoch": 1.4962962962962962, + "grad_norm": 0.2137011724525208, + "learning_rate": 2.787142297138377e-05, + "loss": 0.4408, + "step": 1414 + }, + { + "epoch": 1.4973544973544972, + "grad_norm": 0.21652484554883622, + "learning_rate": 2.785182281458252e-05, + "loss": 0.3977, + "step": 1415 + }, + { + "epoch": 1.4984126984126984, + "grad_norm": 0.22029526293438814, + "learning_rate": 2.7832222657781265e-05, + "loss": 0.4157, + "step": 1416 + }, + { + "epoch": 1.4994708994708994, + "grad_norm": 1.4501431445796187, + "learning_rate": 2.7812622500980007e-05, + "loss": 0.5586, + "step": 1417 + }, + { + "epoch": 1.5005291005291004, + "grad_norm": 0.22081871178721485, + "learning_rate": 2.7793022344178753e-05, + "loss": 0.388, + "step": 1418 + }, + { + "epoch": 1.5015873015873016, + "grad_norm": 0.23512336150560495, + "learning_rate": 2.7773422187377502e-05, + "loss": 0.5023, + "step": 1419 + }, + { + "epoch": 1.5026455026455028, + "grad_norm": 0.20454204650582183, + "learning_rate": 2.7753822030576248e-05, + "loss": 0.4074, + "step": 1420 + }, + { + "epoch": 1.5037037037037035, + "grad_norm": 0.23959488063488388, + "learning_rate": 2.773422187377499e-05, + "loss": 0.408, + "step": 1421 + }, + { + "epoch": 1.5047619047619047, + "grad_norm": 0.21797011158561788, + "learning_rate": 2.7714621716973737e-05, + "loss": 0.4108, + "step": 1422 + }, + { + "epoch": 1.505820105820106, + "grad_norm": 0.2199227438643587, + "learning_rate": 2.7695021560172486e-05, + "loss": 0.4537, + "step": 1423 + }, + { + "epoch": 1.506878306878307, + "grad_norm": 0.2197835414815512, + "learning_rate": 2.7675421403371228e-05, + "loss": 0.4258, + "step": 1424 + }, + { + "epoch": 1.507936507936508, + "grad_norm": 0.2527966111155505, + "learning_rate": 2.7655821246569974e-05, + "loss": 0.5248, + "step": 1425 + }, + { + "epoch": 1.508994708994709, + "grad_norm": 0.2261829434017835, + "learning_rate": 2.763622108976872e-05, + "loss": 0.4856, + "step": 1426 + }, + { + "epoch": 1.51005291005291, + "grad_norm": 0.22385869251344861, + "learning_rate": 2.7616620932967463e-05, + "loss": 0.4547, + "step": 1427 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 0.23763869903387455, + "learning_rate": 2.7597020776166212e-05, + "loss": 0.4457, + "step": 1428 + }, + { + "epoch": 1.5121693121693123, + "grad_norm": 0.24360801662833237, + "learning_rate": 2.7577420619364958e-05, + "loss": 0.453, + "step": 1429 + }, + { + "epoch": 1.5132275132275133, + "grad_norm": 0.21514915559651115, + "learning_rate": 2.75578204625637e-05, + "loss": 0.4227, + "step": 1430 + }, + { + "epoch": 1.5142857142857142, + "grad_norm": 0.25074559433170956, + "learning_rate": 2.7538220305762446e-05, + "loss": 0.4814, + "step": 1431 + }, + { + "epoch": 1.5153439153439154, + "grad_norm": 0.2130270827244537, + "learning_rate": 2.7518620148961195e-05, + "loss": 0.4009, + "step": 1432 + }, + { + "epoch": 1.5164021164021164, + "grad_norm": 0.2152015038995373, + "learning_rate": 2.749901999215994e-05, + "loss": 0.3733, + "step": 1433 + }, + { + "epoch": 1.5174603174603174, + "grad_norm": 0.24155192005071274, + "learning_rate": 2.7479419835358684e-05, + "loss": 0.3632, + "step": 1434 + }, + { + "epoch": 1.5185185185185186, + "grad_norm": 0.2503955177301381, + "learning_rate": 2.745981967855743e-05, + "loss": 0.3957, + "step": 1435 + }, + { + "epoch": 1.5195767195767196, + "grad_norm": 0.21447061332958312, + "learning_rate": 2.744021952175618e-05, + "loss": 0.4501, + "step": 1436 + }, + { + "epoch": 1.5206349206349206, + "grad_norm": 0.2142050063037971, + "learning_rate": 2.7420619364954918e-05, + "loss": 0.4045, + "step": 1437 + }, + { + "epoch": 1.5216931216931218, + "grad_norm": 0.23469888425670518, + "learning_rate": 2.7401019208153667e-05, + "loss": 0.4069, + "step": 1438 + }, + { + "epoch": 1.5227513227513227, + "grad_norm": 0.2373104219367434, + "learning_rate": 2.7381419051352413e-05, + "loss": 0.4439, + "step": 1439 + }, + { + "epoch": 1.5238095238095237, + "grad_norm": 0.22931990044472228, + "learning_rate": 2.7361818894551155e-05, + "loss": 0.4105, + "step": 1440 + }, + { + "epoch": 1.524867724867725, + "grad_norm": 0.24468342878432897, + "learning_rate": 2.73422187377499e-05, + "loss": 0.4859, + "step": 1441 + }, + { + "epoch": 1.525925925925926, + "grad_norm": 0.23888600508604607, + "learning_rate": 2.732261858094865e-05, + "loss": 0.4692, + "step": 1442 + }, + { + "epoch": 1.5269841269841269, + "grad_norm": 0.2379674355949921, + "learning_rate": 2.7303018424147393e-05, + "loss": 0.4565, + "step": 1443 + }, + { + "epoch": 1.528042328042328, + "grad_norm": 0.21540467607096592, + "learning_rate": 2.728341826734614e-05, + "loss": 0.4183, + "step": 1444 + }, + { + "epoch": 1.529100529100529, + "grad_norm": 0.2185049084742189, + "learning_rate": 2.7263818110544885e-05, + "loss": 0.4343, + "step": 1445 + }, + { + "epoch": 1.53015873015873, + "grad_norm": 0.3364663153817796, + "learning_rate": 2.7244217953743634e-05, + "loss": 0.4324, + "step": 1446 + }, + { + "epoch": 1.5312169312169313, + "grad_norm": 0.2053796079743368, + "learning_rate": 2.7224617796942377e-05, + "loss": 0.3719, + "step": 1447 + }, + { + "epoch": 1.5322751322751322, + "grad_norm": 0.22693217543362984, + "learning_rate": 2.7205017640141122e-05, + "loss": 0.4356, + "step": 1448 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.27177436391000964, + "learning_rate": 2.718541748333987e-05, + "loss": 0.5063, + "step": 1449 + }, + { + "epoch": 1.5343915343915344, + "grad_norm": 0.2272754512480649, + "learning_rate": 2.716581732653861e-05, + "loss": 0.4487, + "step": 1450 + }, + { + "epoch": 1.5354497354497354, + "grad_norm": 0.2410315063196938, + "learning_rate": 2.714621716973736e-05, + "loss": 0.4438, + "step": 1451 + }, + { + "epoch": 1.5365079365079364, + "grad_norm": 0.25742696962818656, + "learning_rate": 2.7126617012936106e-05, + "loss": 0.4149, + "step": 1452 + }, + { + "epoch": 1.5375661375661376, + "grad_norm": 0.21908883154703712, + "learning_rate": 2.710701685613485e-05, + "loss": 0.4035, + "step": 1453 + }, + { + "epoch": 1.5386243386243388, + "grad_norm": 0.2720610233232513, + "learning_rate": 2.7087416699333594e-05, + "loss": 0.4956, + "step": 1454 + }, + { + "epoch": 1.5396825396825395, + "grad_norm": 0.25478178075039626, + "learning_rate": 2.7067816542532344e-05, + "loss": 0.4651, + "step": 1455 + }, + { + "epoch": 1.5407407407407407, + "grad_norm": 0.24946076321329155, + "learning_rate": 2.7048216385731083e-05, + "loss": 0.4157, + "step": 1456 + }, + { + "epoch": 1.541798941798942, + "grad_norm": 0.22111397690870904, + "learning_rate": 2.7028616228929832e-05, + "loss": 0.4587, + "step": 1457 + }, + { + "epoch": 1.5428571428571427, + "grad_norm": 0.2324140499446646, + "learning_rate": 2.7009016072128578e-05, + "loss": 0.405, + "step": 1458 + }, + { + "epoch": 1.543915343915344, + "grad_norm": 0.22732415750667306, + "learning_rate": 2.6989415915327327e-05, + "loss": 0.4055, + "step": 1459 + }, + { + "epoch": 1.544973544973545, + "grad_norm": 0.24012853054594216, + "learning_rate": 2.696981575852607e-05, + "loss": 0.4123, + "step": 1460 + }, + { + "epoch": 1.5460317460317459, + "grad_norm": 0.23424926428598633, + "learning_rate": 2.6950215601724815e-05, + "loss": 0.4752, + "step": 1461 + }, + { + "epoch": 1.547089947089947, + "grad_norm": 0.21660322408001612, + "learning_rate": 2.693061544492356e-05, + "loss": 0.4178, + "step": 1462 + }, + { + "epoch": 1.5481481481481483, + "grad_norm": 0.24724861696989267, + "learning_rate": 2.6911015288122304e-05, + "loss": 0.4814, + "step": 1463 + }, + { + "epoch": 1.5492063492063493, + "grad_norm": 2.4202839601143395, + "learning_rate": 2.6891415131321053e-05, + "loss": 0.4914, + "step": 1464 + }, + { + "epoch": 1.5502645502645502, + "grad_norm": 0.2730981950672889, + "learning_rate": 2.68718149745198e-05, + "loss": 0.4757, + "step": 1465 + }, + { + "epoch": 1.5513227513227514, + "grad_norm": 0.25371740302385765, + "learning_rate": 2.685221481771854e-05, + "loss": 0.4303, + "step": 1466 + }, + { + "epoch": 1.5523809523809524, + "grad_norm": 0.23514106327423817, + "learning_rate": 2.6832614660917287e-05, + "loss": 0.4854, + "step": 1467 + }, + { + "epoch": 1.5534391534391534, + "grad_norm": 0.2708610524635199, + "learning_rate": 2.6813014504116037e-05, + "loss": 0.4422, + "step": 1468 + }, + { + "epoch": 1.5544973544973546, + "grad_norm": 0.30542783481347546, + "learning_rate": 2.6793414347314782e-05, + "loss": 0.528, + "step": 1469 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.2966857021941013, + "learning_rate": 2.6773814190513525e-05, + "loss": 0.5021, + "step": 1470 + }, + { + "epoch": 1.5566137566137566, + "grad_norm": 0.2860444368837978, + "learning_rate": 2.675421403371227e-05, + "loss": 0.4251, + "step": 1471 + }, + { + "epoch": 1.5576719576719578, + "grad_norm": 0.28665886400785867, + "learning_rate": 2.673461387691102e-05, + "loss": 0.4435, + "step": 1472 + }, + { + "epoch": 1.5587301587301587, + "grad_norm": 0.2500775008406684, + "learning_rate": 2.671501372010976e-05, + "loss": 0.4658, + "step": 1473 + }, + { + "epoch": 1.5597883597883597, + "grad_norm": 0.22253440194972182, + "learning_rate": 2.669541356330851e-05, + "loss": 0.4646, + "step": 1474 + }, + { + "epoch": 1.560846560846561, + "grad_norm": 0.2146340288566786, + "learning_rate": 2.6675813406507254e-05, + "loss": 0.3825, + "step": 1475 + }, + { + "epoch": 1.561904761904762, + "grad_norm": 0.25064823580585577, + "learning_rate": 2.6656213249705997e-05, + "loss": 0.4584, + "step": 1476 + }, + { + "epoch": 1.5629629629629629, + "grad_norm": 0.2134463383511302, + "learning_rate": 2.6636613092904743e-05, + "loss": 0.3789, + "step": 1477 + }, + { + "epoch": 1.564021164021164, + "grad_norm": 0.2151371289165454, + "learning_rate": 2.6617012936103492e-05, + "loss": 0.4082, + "step": 1478 + }, + { + "epoch": 1.565079365079365, + "grad_norm": 0.211315815255196, + "learning_rate": 2.6597412779302234e-05, + "loss": 0.4444, + "step": 1479 + }, + { + "epoch": 1.566137566137566, + "grad_norm": 0.2638512019499202, + "learning_rate": 2.657781262250098e-05, + "loss": 0.4507, + "step": 1480 + }, + { + "epoch": 1.5671957671957673, + "grad_norm": 0.2249085194794952, + "learning_rate": 2.6558212465699726e-05, + "loss": 0.4262, + "step": 1481 + }, + { + "epoch": 1.5682539682539682, + "grad_norm": 0.22540466561989844, + "learning_rate": 2.6538612308898475e-05, + "loss": 0.4295, + "step": 1482 + }, + { + "epoch": 1.5693121693121692, + "grad_norm": 0.30665171807723735, + "learning_rate": 2.6519012152097218e-05, + "loss": 0.4453, + "step": 1483 + }, + { + "epoch": 1.5703703703703704, + "grad_norm": 0.22641322915874576, + "learning_rate": 2.6499411995295964e-05, + "loss": 0.4591, + "step": 1484 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.23344135403479024, + "learning_rate": 2.647981183849471e-05, + "loss": 0.4637, + "step": 1485 + }, + { + "epoch": 1.5724867724867724, + "grad_norm": 0.22588214050919211, + "learning_rate": 2.6460211681693452e-05, + "loss": 0.4211, + "step": 1486 + }, + { + "epoch": 1.5735449735449736, + "grad_norm": 0.21399920852193247, + "learning_rate": 2.64406115248922e-05, + "loss": 0.411, + "step": 1487 + }, + { + "epoch": 1.5746031746031746, + "grad_norm": 0.23919553348000056, + "learning_rate": 2.6421011368090947e-05, + "loss": 0.4556, + "step": 1488 + }, + { + "epoch": 1.5756613756613755, + "grad_norm": 0.2540160136398132, + "learning_rate": 2.640141121128969e-05, + "loss": 0.4276, + "step": 1489 + }, + { + "epoch": 1.5767195767195767, + "grad_norm": 0.21268692149914978, + "learning_rate": 2.6381811054488436e-05, + "loss": 0.4493, + "step": 1490 + }, + { + "epoch": 1.5777777777777777, + "grad_norm": 0.2504004750600532, + "learning_rate": 2.6362210897687185e-05, + "loss": 0.4475, + "step": 1491 + }, + { + "epoch": 1.5788359788359787, + "grad_norm": 0.23627610401779106, + "learning_rate": 2.6342610740885927e-05, + "loss": 0.4387, + "step": 1492 + }, + { + "epoch": 1.57989417989418, + "grad_norm": 0.24142802272346445, + "learning_rate": 2.6323010584084673e-05, + "loss": 0.4786, + "step": 1493 + }, + { + "epoch": 1.580952380952381, + "grad_norm": 0.2175227164819747, + "learning_rate": 2.630341042728342e-05, + "loss": 0.3726, + "step": 1494 + }, + { + "epoch": 1.5820105820105819, + "grad_norm": 0.2753522156824328, + "learning_rate": 2.628381027048217e-05, + "loss": 0.476, + "step": 1495 + }, + { + "epoch": 1.583068783068783, + "grad_norm": 0.23691656483032675, + "learning_rate": 2.626421011368091e-05, + "loss": 0.4573, + "step": 1496 + }, + { + "epoch": 1.5841269841269843, + "grad_norm": 0.2710431505452728, + "learning_rate": 2.6244609956879657e-05, + "loss": 0.4574, + "step": 1497 + }, + { + "epoch": 1.585185185185185, + "grad_norm": 0.23025660214601665, + "learning_rate": 2.6225009800078403e-05, + "loss": 0.4509, + "step": 1498 + }, + { + "epoch": 1.5862433862433862, + "grad_norm": 0.23275216490060904, + "learning_rate": 2.6205409643277145e-05, + "loss": 0.4509, + "step": 1499 + }, + { + "epoch": 1.5873015873015874, + "grad_norm": 0.2515703737282682, + "learning_rate": 2.6185809486475894e-05, + "loss": 0.418, + "step": 1500 + }, + { + "epoch": 1.5883597883597882, + "grad_norm": 0.22937595837034908, + "learning_rate": 2.616620932967464e-05, + "loss": 0.4749, + "step": 1501 + }, + { + "epoch": 1.5894179894179894, + "grad_norm": 0.22662452164402827, + "learning_rate": 2.6146609172873383e-05, + "loss": 0.431, + "step": 1502 + }, + { + "epoch": 1.5904761904761906, + "grad_norm": 0.24520042229444927, + "learning_rate": 2.612700901607213e-05, + "loss": 0.4334, + "step": 1503 + }, + { + "epoch": 1.5915343915343916, + "grad_norm": 0.21389765221210016, + "learning_rate": 2.6107408859270878e-05, + "loss": 0.3985, + "step": 1504 + }, + { + "epoch": 1.5925925925925926, + "grad_norm": 0.2520836403046001, + "learning_rate": 2.6087808702469617e-05, + "loss": 0.4952, + "step": 1505 + }, + { + "epoch": 1.5936507936507938, + "grad_norm": 0.2590281741764871, + "learning_rate": 2.6068208545668366e-05, + "loss": 0.4153, + "step": 1506 + }, + { + "epoch": 1.5947089947089947, + "grad_norm": 0.24382084508627283, + "learning_rate": 2.6048608388867112e-05, + "loss": 0.428, + "step": 1507 + }, + { + "epoch": 1.5957671957671957, + "grad_norm": 0.2738433714268017, + "learning_rate": 2.602900823206586e-05, + "loss": 0.5297, + "step": 1508 + }, + { + "epoch": 1.596825396825397, + "grad_norm": 0.24412194302796253, + "learning_rate": 2.60094080752646e-05, + "loss": 0.4653, + "step": 1509 + }, + { + "epoch": 1.597883597883598, + "grad_norm": 0.24762704362472204, + "learning_rate": 2.598980791846335e-05, + "loss": 0.4746, + "step": 1510 + }, + { + "epoch": 1.5989417989417989, + "grad_norm": 0.21515171581042802, + "learning_rate": 2.5970207761662096e-05, + "loss": 0.4151, + "step": 1511 + }, + { + "epoch": 1.6, + "grad_norm": 0.2500799916715579, + "learning_rate": 2.5950607604860838e-05, + "loss": 0.417, + "step": 1512 + }, + { + "epoch": 1.601058201058201, + "grad_norm": 0.2381688667695587, + "learning_rate": 2.5931007448059584e-05, + "loss": 0.4448, + "step": 1513 + }, + { + "epoch": 1.602116402116402, + "grad_norm": 0.24191214514605877, + "learning_rate": 2.5911407291258333e-05, + "loss": 0.4683, + "step": 1514 + }, + { + "epoch": 1.6031746031746033, + "grad_norm": 0.23784184704388528, + "learning_rate": 2.5891807134457076e-05, + "loss": 0.4265, + "step": 1515 + }, + { + "epoch": 1.6042328042328042, + "grad_norm": 0.2734834110756681, + "learning_rate": 2.587220697765582e-05, + "loss": 0.439, + "step": 1516 + }, + { + "epoch": 1.6052910052910052, + "grad_norm": 0.25396515727811014, + "learning_rate": 2.5852606820854568e-05, + "loss": 0.461, + "step": 1517 + }, + { + "epoch": 1.6063492063492064, + "grad_norm": 0.242809898846282, + "learning_rate": 2.5833006664053317e-05, + "loss": 0.4573, + "step": 1518 + }, + { + "epoch": 1.6074074074074074, + "grad_norm": 0.24330748961980134, + "learning_rate": 2.581340650725206e-05, + "loss": 0.4954, + "step": 1519 + }, + { + "epoch": 1.6084656084656084, + "grad_norm": 0.27485852795834315, + "learning_rate": 2.5793806350450805e-05, + "loss": 0.428, + "step": 1520 + }, + { + "epoch": 1.6095238095238096, + "grad_norm": 0.2445502116760993, + "learning_rate": 2.577420619364955e-05, + "loss": 0.4789, + "step": 1521 + }, + { + "epoch": 1.6105820105820106, + "grad_norm": 0.20220337369742722, + "learning_rate": 2.5754606036848293e-05, + "loss": 0.4299, + "step": 1522 + }, + { + "epoch": 1.6116402116402115, + "grad_norm": 0.2587743856048611, + "learning_rate": 2.5735005880047043e-05, + "loss": 0.4562, + "step": 1523 + }, + { + "epoch": 1.6126984126984127, + "grad_norm": 0.24738675168343857, + "learning_rate": 2.571540572324579e-05, + "loss": 0.4662, + "step": 1524 + }, + { + "epoch": 1.6137566137566137, + "grad_norm": 0.24385074926826564, + "learning_rate": 2.569580556644453e-05, + "loss": 0.5195, + "step": 1525 + }, + { + "epoch": 1.6148148148148147, + "grad_norm": 0.23106098933576363, + "learning_rate": 2.5676205409643277e-05, + "loss": 0.4307, + "step": 1526 + }, + { + "epoch": 1.615873015873016, + "grad_norm": 0.2717986108900208, + "learning_rate": 2.5656605252842026e-05, + "loss": 0.4243, + "step": 1527 + }, + { + "epoch": 1.6169312169312169, + "grad_norm": 0.23631217436069807, + "learning_rate": 2.563700509604077e-05, + "loss": 0.4001, + "step": 1528 + }, + { + "epoch": 1.6179894179894179, + "grad_norm": 0.21316396332645726, + "learning_rate": 2.5617404939239515e-05, + "loss": 0.4231, + "step": 1529 + }, + { + "epoch": 1.619047619047619, + "grad_norm": 0.2529199744670375, + "learning_rate": 2.559780478243826e-05, + "loss": 0.4383, + "step": 1530 + }, + { + "epoch": 1.6201058201058203, + "grad_norm": 0.25696462723939195, + "learning_rate": 2.557820462563701e-05, + "loss": 0.4237, + "step": 1531 + }, + { + "epoch": 1.621164021164021, + "grad_norm": 0.21751147659743908, + "learning_rate": 2.5558604468835752e-05, + "loss": 0.4341, + "step": 1532 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 0.23705868496582927, + "learning_rate": 2.5539004312034498e-05, + "loss": 0.4627, + "step": 1533 + }, + { + "epoch": 1.6232804232804234, + "grad_norm": 0.228268665473131, + "learning_rate": 2.5519404155233244e-05, + "loss": 0.4752, + "step": 1534 + }, + { + "epoch": 1.6243386243386242, + "grad_norm": 0.22482394544205297, + "learning_rate": 2.5499803998431986e-05, + "loss": 0.3858, + "step": 1535 + }, + { + "epoch": 1.6253968253968254, + "grad_norm": 0.22061168915586668, + "learning_rate": 2.5480203841630736e-05, + "loss": 0.4521, + "step": 1536 + }, + { + "epoch": 1.6264550264550266, + "grad_norm": 0.2187772213768095, + "learning_rate": 2.546060368482948e-05, + "loss": 0.4239, + "step": 1537 + }, + { + "epoch": 1.6275132275132274, + "grad_norm": 0.24727489561841617, + "learning_rate": 2.5441003528028224e-05, + "loss": 0.4374, + "step": 1538 + }, + { + "epoch": 1.6285714285714286, + "grad_norm": 0.22113247609707468, + "learning_rate": 2.542140337122697e-05, + "loss": 0.4134, + "step": 1539 + }, + { + "epoch": 1.6296296296296298, + "grad_norm": 0.19784291165488221, + "learning_rate": 2.540180321442572e-05, + "loss": 0.4032, + "step": 1540 + }, + { + "epoch": 1.6306878306878307, + "grad_norm": 0.2619577804116681, + "learning_rate": 2.538220305762446e-05, + "loss": 0.4637, + "step": 1541 + }, + { + "epoch": 1.6317460317460317, + "grad_norm": 0.22538233425794543, + "learning_rate": 2.5362602900823208e-05, + "loss": 0.4441, + "step": 1542 + }, + { + "epoch": 1.632804232804233, + "grad_norm": 0.24391278793008334, + "learning_rate": 2.5343002744021953e-05, + "loss": 0.4614, + "step": 1543 + }, + { + "epoch": 1.633862433862434, + "grad_norm": 0.20585304887187394, + "learning_rate": 2.5323402587220703e-05, + "loss": 0.4181, + "step": 1544 + }, + { + "epoch": 1.6349206349206349, + "grad_norm": 0.22818882395320167, + "learning_rate": 2.5303802430419442e-05, + "loss": 0.4149, + "step": 1545 + }, + { + "epoch": 1.635978835978836, + "grad_norm": 0.2350090079830974, + "learning_rate": 2.528420227361819e-05, + "loss": 0.4735, + "step": 1546 + }, + { + "epoch": 1.637037037037037, + "grad_norm": 0.22082978235289127, + "learning_rate": 2.5264602116816937e-05, + "loss": 0.4501, + "step": 1547 + }, + { + "epoch": 1.638095238095238, + "grad_norm": 0.21032695498106663, + "learning_rate": 2.524500196001568e-05, + "loss": 0.4295, + "step": 1548 + }, + { + "epoch": 1.6391534391534393, + "grad_norm": 0.21324437007116495, + "learning_rate": 2.5225401803214425e-05, + "loss": 0.3839, + "step": 1549 + }, + { + "epoch": 1.6402116402116402, + "grad_norm": 0.2389234288277766, + "learning_rate": 2.5205801646413175e-05, + "loss": 0.4673, + "step": 1550 + }, + { + "epoch": 1.6412698412698412, + "grad_norm": 0.20583896765782994, + "learning_rate": 2.5186201489611917e-05, + "loss": 0.4092, + "step": 1551 + }, + { + "epoch": 1.6423280423280424, + "grad_norm": 0.22499957708636312, + "learning_rate": 2.5166601332810663e-05, + "loss": 0.4721, + "step": 1552 + }, + { + "epoch": 1.6433862433862434, + "grad_norm": 0.8465391318181628, + "learning_rate": 2.514700117600941e-05, + "loss": 0.4525, + "step": 1553 + }, + { + "epoch": 1.6444444444444444, + "grad_norm": 0.2196056190105969, + "learning_rate": 2.512740101920815e-05, + "loss": 0.3989, + "step": 1554 + }, + { + "epoch": 1.6455026455026456, + "grad_norm": 0.2638083915277794, + "learning_rate": 2.51078008624069e-05, + "loss": 0.4352, + "step": 1555 + }, + { + "epoch": 1.6465608465608466, + "grad_norm": 0.25435791435225086, + "learning_rate": 2.5088200705605646e-05, + "loss": 0.4399, + "step": 1556 + }, + { + "epoch": 1.6476190476190475, + "grad_norm": 0.23162932695663496, + "learning_rate": 2.5068600548804392e-05, + "loss": 0.4378, + "step": 1557 + }, + { + "epoch": 1.6486772486772487, + "grad_norm": 0.24325491659040138, + "learning_rate": 2.5049000392003135e-05, + "loss": 0.4726, + "step": 1558 + }, + { + "epoch": 1.6497354497354497, + "grad_norm": 0.2692936406441785, + "learning_rate": 2.5029400235201884e-05, + "loss": 0.503, + "step": 1559 + }, + { + "epoch": 1.6507936507936507, + "grad_norm": 0.23044637601493173, + "learning_rate": 2.500980007840063e-05, + "loss": 0.4153, + "step": 1560 + }, + { + "epoch": 1.651851851851852, + "grad_norm": 0.19978472969279348, + "learning_rate": 2.4990199921599376e-05, + "loss": 0.4189, + "step": 1561 + }, + { + "epoch": 1.6529100529100529, + "grad_norm": 2.0130289348317545, + "learning_rate": 2.497059976479812e-05, + "loss": 0.5285, + "step": 1562 + }, + { + "epoch": 1.6539682539682539, + "grad_norm": 0.6382100207364493, + "learning_rate": 2.4950999607996864e-05, + "loss": 0.498, + "step": 1563 + }, + { + "epoch": 1.655026455026455, + "grad_norm": 0.2873980573844794, + "learning_rate": 2.493139945119561e-05, + "loss": 0.4345, + "step": 1564 + }, + { + "epoch": 1.656084656084656, + "grad_norm": 0.2238132320590369, + "learning_rate": 2.4911799294394356e-05, + "loss": 0.4107, + "step": 1565 + }, + { + "epoch": 1.657142857142857, + "grad_norm": 0.22847079158253494, + "learning_rate": 2.4892199137593102e-05, + "loss": 0.4377, + "step": 1566 + }, + { + "epoch": 1.6582010582010582, + "grad_norm": 0.27256045565172915, + "learning_rate": 2.4872598980791848e-05, + "loss": 0.4313, + "step": 1567 + }, + { + "epoch": 1.6592592592592592, + "grad_norm": 0.2586293011878613, + "learning_rate": 2.4852998823990594e-05, + "loss": 0.4437, + "step": 1568 + }, + { + "epoch": 1.6603174603174602, + "grad_norm": 0.22800723424295755, + "learning_rate": 2.483339866718934e-05, + "loss": 0.4273, + "step": 1569 + }, + { + "epoch": 1.6613756613756614, + "grad_norm": 0.24485218448636267, + "learning_rate": 2.4813798510388085e-05, + "loss": 0.4393, + "step": 1570 + }, + { + "epoch": 1.6624338624338626, + "grad_norm": 0.2484620989572041, + "learning_rate": 2.479419835358683e-05, + "loss": 0.3975, + "step": 1571 + }, + { + "epoch": 1.6634920634920634, + "grad_norm": 0.2124965525509564, + "learning_rate": 2.4774598196785577e-05, + "loss": 0.4508, + "step": 1572 + }, + { + "epoch": 1.6645502645502646, + "grad_norm": 0.24457740342964363, + "learning_rate": 2.475499803998432e-05, + "loss": 0.4299, + "step": 1573 + }, + { + "epoch": 1.6656084656084658, + "grad_norm": 0.25001350248928084, + "learning_rate": 2.473539788318307e-05, + "loss": 0.4924, + "step": 1574 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.2307802816899225, + "learning_rate": 2.471579772638181e-05, + "loss": 0.4466, + "step": 1575 + }, + { + "epoch": 1.6677248677248677, + "grad_norm": 0.22210528313961542, + "learning_rate": 2.469619756958056e-05, + "loss": 0.404, + "step": 1576 + }, + { + "epoch": 1.668783068783069, + "grad_norm": 0.21443376857710403, + "learning_rate": 2.4676597412779303e-05, + "loss": 0.4274, + "step": 1577 + }, + { + "epoch": 1.6698412698412697, + "grad_norm": 0.22624353684944185, + "learning_rate": 2.465699725597805e-05, + "loss": 0.436, + "step": 1578 + }, + { + "epoch": 1.6708994708994709, + "grad_norm": 0.23099050262288628, + "learning_rate": 2.4637397099176795e-05, + "loss": 0.4582, + "step": 1579 + }, + { + "epoch": 1.671957671957672, + "grad_norm": 0.20700052537048644, + "learning_rate": 2.461779694237554e-05, + "loss": 0.4105, + "step": 1580 + }, + { + "epoch": 1.673015873015873, + "grad_norm": 0.21408619017989325, + "learning_rate": 2.4598196785574283e-05, + "loss": 0.4435, + "step": 1581 + }, + { + "epoch": 1.674074074074074, + "grad_norm": 0.22932373910076814, + "learning_rate": 2.4578596628773032e-05, + "loss": 0.4423, + "step": 1582 + }, + { + "epoch": 1.6751322751322753, + "grad_norm": 0.2701302455070178, + "learning_rate": 2.4558996471971775e-05, + "loss": 0.491, + "step": 1583 + }, + { + "epoch": 1.6761904761904762, + "grad_norm": 0.22544337265587341, + "learning_rate": 2.4539396315170524e-05, + "loss": 0.4743, + "step": 1584 + }, + { + "epoch": 1.6772486772486772, + "grad_norm": 0.21709705988111216, + "learning_rate": 2.4519796158369267e-05, + "loss": 0.4065, + "step": 1585 + }, + { + "epoch": 1.6783068783068784, + "grad_norm": 0.24996850315949026, + "learning_rate": 2.4500196001568013e-05, + "loss": 0.4375, + "step": 1586 + }, + { + "epoch": 1.6793650793650794, + "grad_norm": 0.9469921058616022, + "learning_rate": 2.448059584476676e-05, + "loss": 0.4191, + "step": 1587 + }, + { + "epoch": 1.6804232804232804, + "grad_norm": 0.22085668242729445, + "learning_rate": 2.4460995687965504e-05, + "loss": 0.48, + "step": 1588 + }, + { + "epoch": 1.6814814814814816, + "grad_norm": 0.23494830806105532, + "learning_rate": 2.444139553116425e-05, + "loss": 0.4781, + "step": 1589 + }, + { + "epoch": 1.6825396825396826, + "grad_norm": 0.26542532348970826, + "learning_rate": 2.4421795374362996e-05, + "loss": 0.4089, + "step": 1590 + }, + { + "epoch": 1.6835978835978835, + "grad_norm": 0.25254211616363226, + "learning_rate": 2.4402195217561742e-05, + "loss": 0.4629, + "step": 1591 + }, + { + "epoch": 1.6846560846560847, + "grad_norm": 0.2273568902013295, + "learning_rate": 2.4382595060760488e-05, + "loss": 0.4592, + "step": 1592 + }, + { + "epoch": 1.6857142857142857, + "grad_norm": 0.2594410680121094, + "learning_rate": 2.4362994903959234e-05, + "loss": 0.439, + "step": 1593 + }, + { + "epoch": 1.6867724867724867, + "grad_norm": 0.2274722145506129, + "learning_rate": 2.4343394747157976e-05, + "loss": 0.4246, + "step": 1594 + }, + { + "epoch": 1.687830687830688, + "grad_norm": 0.2400296192105033, + "learning_rate": 2.4323794590356725e-05, + "loss": 0.4985, + "step": 1595 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 0.2335755377011187, + "learning_rate": 2.4304194433555468e-05, + "loss": 0.5176, + "step": 1596 + }, + { + "epoch": 1.6899470899470899, + "grad_norm": 0.22541799468462043, + "learning_rate": 2.4284594276754217e-05, + "loss": 0.391, + "step": 1597 + }, + { + "epoch": 1.691005291005291, + "grad_norm": 0.25348107176315676, + "learning_rate": 2.426499411995296e-05, + "loss": 0.4718, + "step": 1598 + }, + { + "epoch": 1.692063492063492, + "grad_norm": 0.2134398598463444, + "learning_rate": 2.4245393963151706e-05, + "loss": 0.4563, + "step": 1599 + }, + { + "epoch": 1.693121693121693, + "grad_norm": 0.2799436267035962, + "learning_rate": 2.422579380635045e-05, + "loss": 0.4554, + "step": 1600 + }, + { + "epoch": 1.6941798941798942, + "grad_norm": 0.2239050910441869, + "learning_rate": 2.4206193649549197e-05, + "loss": 0.4303, + "step": 1601 + }, + { + "epoch": 1.6952380952380952, + "grad_norm": 5.342628537787954, + "learning_rate": 2.4186593492747943e-05, + "loss": 0.4961, + "step": 1602 + }, + { + "epoch": 1.6962962962962962, + "grad_norm": 1.9189218328198483, + "learning_rate": 2.416699333594669e-05, + "loss": 0.5465, + "step": 1603 + }, + { + "epoch": 1.6973544973544974, + "grad_norm": 0.22908151431119422, + "learning_rate": 2.4147393179145435e-05, + "loss": 0.4628, + "step": 1604 + }, + { + "epoch": 1.6984126984126984, + "grad_norm": 0.20485996984971452, + "learning_rate": 2.412779302234418e-05, + "loss": 0.4013, + "step": 1605 + }, + { + "epoch": 1.6994708994708994, + "grad_norm": 0.21384044603948377, + "learning_rate": 2.4108192865542927e-05, + "loss": 0.4111, + "step": 1606 + }, + { + "epoch": 1.7005291005291006, + "grad_norm": 0.20944814894771038, + "learning_rate": 2.4088592708741673e-05, + "loss": 0.4487, + "step": 1607 + }, + { + "epoch": 1.7015873015873015, + "grad_norm": 0.21567540070670468, + "learning_rate": 2.406899255194042e-05, + "loss": 0.4157, + "step": 1608 + }, + { + "epoch": 1.7026455026455025, + "grad_norm": 0.21951670349669852, + "learning_rate": 2.404939239513916e-05, + "loss": 0.3811, + "step": 1609 + }, + { + "epoch": 1.7037037037037037, + "grad_norm": 0.21975151815133592, + "learning_rate": 2.402979223833791e-05, + "loss": 0.4788, + "step": 1610 + }, + { + "epoch": 1.704761904761905, + "grad_norm": 0.20831580759962212, + "learning_rate": 2.4010192081536653e-05, + "loss": 0.4237, + "step": 1611 + }, + { + "epoch": 1.7058201058201057, + "grad_norm": 0.20479233675329178, + "learning_rate": 2.39905919247354e-05, + "loss": 0.4125, + "step": 1612 + }, + { + "epoch": 1.7068783068783069, + "grad_norm": 0.21043974195059237, + "learning_rate": 2.3970991767934144e-05, + "loss": 0.4114, + "step": 1613 + }, + { + "epoch": 1.707936507936508, + "grad_norm": 0.19363060033043297, + "learning_rate": 2.395139161113289e-05, + "loss": 0.3958, + "step": 1614 + }, + { + "epoch": 1.7089947089947088, + "grad_norm": 0.2203472309140231, + "learning_rate": 2.3931791454331636e-05, + "loss": 0.4136, + "step": 1615 + }, + { + "epoch": 1.71005291005291, + "grad_norm": 0.27731224351626677, + "learning_rate": 2.3912191297530382e-05, + "loss": 0.428, + "step": 1616 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 0.2165312271941672, + "learning_rate": 2.3892591140729124e-05, + "loss": 0.4455, + "step": 1617 + }, + { + "epoch": 1.712169312169312, + "grad_norm": 0.21755495769093408, + "learning_rate": 2.3872990983927874e-05, + "loss": 0.4031, + "step": 1618 + }, + { + "epoch": 1.7132275132275132, + "grad_norm": 0.24222900168163267, + "learning_rate": 2.3853390827126616e-05, + "loss": 0.4786, + "step": 1619 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.23418054019764165, + "learning_rate": 2.3833790670325365e-05, + "loss": 0.4608, + "step": 1620 + }, + { + "epoch": 1.7153439153439154, + "grad_norm": 0.21694499784225765, + "learning_rate": 2.3814190513524108e-05, + "loss": 0.4283, + "step": 1621 + }, + { + "epoch": 1.7164021164021164, + "grad_norm": 0.2546334351383069, + "learning_rate": 2.3794590356722854e-05, + "loss": 0.4606, + "step": 1622 + }, + { + "epoch": 1.7174603174603176, + "grad_norm": 0.2319584310490375, + "learning_rate": 2.37749901999216e-05, + "loss": 0.451, + "step": 1623 + }, + { + "epoch": 1.7185185185185186, + "grad_norm": 0.24477601621399064, + "learning_rate": 2.3755390043120346e-05, + "loss": 0.5059, + "step": 1624 + }, + { + "epoch": 1.7195767195767195, + "grad_norm": 0.36838675048518105, + "learning_rate": 2.373578988631909e-05, + "loss": 0.4285, + "step": 1625 + }, + { + "epoch": 1.7206349206349207, + "grad_norm": 0.2466732852047628, + "learning_rate": 2.3716189729517837e-05, + "loss": 0.4437, + "step": 1626 + }, + { + "epoch": 1.7216931216931217, + "grad_norm": 0.2135957001631125, + "learning_rate": 2.3696589572716583e-05, + "loss": 0.421, + "step": 1627 + }, + { + "epoch": 1.7227513227513227, + "grad_norm": 0.24026169452843113, + "learning_rate": 2.367698941591533e-05, + "loss": 0.4532, + "step": 1628 + }, + { + "epoch": 1.723809523809524, + "grad_norm": 0.22254374311366967, + "learning_rate": 2.3657389259114075e-05, + "loss": 0.4547, + "step": 1629 + }, + { + "epoch": 1.7248677248677249, + "grad_norm": 0.2379541965307219, + "learning_rate": 2.3637789102312817e-05, + "loss": 0.4735, + "step": 1630 + }, + { + "epoch": 1.7259259259259259, + "grad_norm": 0.21785973896658944, + "learning_rate": 2.3618188945511567e-05, + "loss": 0.4757, + "step": 1631 + }, + { + "epoch": 1.726984126984127, + "grad_norm": 0.20413785133119947, + "learning_rate": 2.359858878871031e-05, + "loss": 0.4213, + "step": 1632 + }, + { + "epoch": 1.728042328042328, + "grad_norm": 0.25684940141007967, + "learning_rate": 2.357898863190906e-05, + "loss": 0.4839, + "step": 1633 + }, + { + "epoch": 1.729100529100529, + "grad_norm": 0.22575583816002107, + "learning_rate": 2.35593884751078e-05, + "loss": 0.4319, + "step": 1634 + }, + { + "epoch": 1.7301587301587302, + "grad_norm": 0.2102700377358579, + "learning_rate": 2.3539788318306547e-05, + "loss": 0.4465, + "step": 1635 + }, + { + "epoch": 1.7312169312169312, + "grad_norm": 0.23871894363562643, + "learning_rate": 2.3520188161505293e-05, + "loss": 0.4692, + "step": 1636 + }, + { + "epoch": 1.7322751322751322, + "grad_norm": 0.2257335899512697, + "learning_rate": 2.350058800470404e-05, + "loss": 0.4308, + "step": 1637 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.21199670925667166, + "learning_rate": 2.3480987847902784e-05, + "loss": 0.4399, + "step": 1638 + }, + { + "epoch": 1.7343915343915344, + "grad_norm": 0.2637943802055583, + "learning_rate": 2.346138769110153e-05, + "loss": 0.469, + "step": 1639 + }, + { + "epoch": 1.7354497354497354, + "grad_norm": 0.2091292729089544, + "learning_rate": 2.3441787534300276e-05, + "loss": 0.4079, + "step": 1640 + }, + { + "epoch": 1.7365079365079366, + "grad_norm": 0.21242035801784248, + "learning_rate": 2.3422187377499022e-05, + "loss": 0.433, + "step": 1641 + }, + { + "epoch": 1.7375661375661375, + "grad_norm": 0.21411367696110664, + "learning_rate": 2.3402587220697768e-05, + "loss": 0.445, + "step": 1642 + }, + { + "epoch": 1.7386243386243385, + "grad_norm": 0.22666171986236078, + "learning_rate": 2.338298706389651e-05, + "loss": 0.4352, + "step": 1643 + }, + { + "epoch": 1.7396825396825397, + "grad_norm": 0.2154746362675361, + "learning_rate": 2.336338690709526e-05, + "loss": 0.4225, + "step": 1644 + }, + { + "epoch": 1.7407407407407407, + "grad_norm": 0.20835776969269476, + "learning_rate": 2.3343786750294002e-05, + "loss": 0.423, + "step": 1645 + }, + { + "epoch": 1.7417989417989417, + "grad_norm": 0.22888559147228213, + "learning_rate": 2.332418659349275e-05, + "loss": 0.4047, + "step": 1646 + }, + { + "epoch": 1.7428571428571429, + "grad_norm": 0.2211280651344821, + "learning_rate": 2.3304586436691494e-05, + "loss": 0.3904, + "step": 1647 + }, + { + "epoch": 1.7439153439153439, + "grad_norm": 0.23013001218859871, + "learning_rate": 2.328498627989024e-05, + "loss": 0.4983, + "step": 1648 + }, + { + "epoch": 1.7449735449735448, + "grad_norm": 0.2202927426992636, + "learning_rate": 2.3265386123088986e-05, + "loss": 0.4676, + "step": 1649 + }, + { + "epoch": 1.746031746031746, + "grad_norm": 0.22953761733384298, + "learning_rate": 2.324578596628773e-05, + "loss": 0.45, + "step": 1650 + }, + { + "epoch": 1.7470899470899472, + "grad_norm": 0.22282806910302103, + "learning_rate": 2.3226185809486477e-05, + "loss": 0.4218, + "step": 1651 + }, + { + "epoch": 1.748148148148148, + "grad_norm": 0.23634054688203343, + "learning_rate": 2.3206585652685223e-05, + "loss": 0.464, + "step": 1652 + }, + { + "epoch": 1.7492063492063492, + "grad_norm": 0.22465725068012996, + "learning_rate": 2.3186985495883966e-05, + "loss": 0.448, + "step": 1653 + }, + { + "epoch": 1.7502645502645504, + "grad_norm": 0.23369312668049108, + "learning_rate": 2.3167385339082715e-05, + "loss": 0.4475, + "step": 1654 + }, + { + "epoch": 1.7513227513227512, + "grad_norm": 0.21668131882841285, + "learning_rate": 2.3147785182281458e-05, + "loss": 0.4246, + "step": 1655 + }, + { + "epoch": 1.7523809523809524, + "grad_norm": 0.22432886200596663, + "learning_rate": 2.3128185025480207e-05, + "loss": 0.4726, + "step": 1656 + }, + { + "epoch": 1.7534391534391536, + "grad_norm": 0.23217067772618433, + "learning_rate": 2.310858486867895e-05, + "loss": 0.4217, + "step": 1657 + }, + { + "epoch": 1.7544973544973543, + "grad_norm": 0.23998057651431876, + "learning_rate": 2.3088984711877695e-05, + "loss": 0.4478, + "step": 1658 + }, + { + "epoch": 1.7555555555555555, + "grad_norm": 0.20789843322357746, + "learning_rate": 2.306938455507644e-05, + "loss": 0.4093, + "step": 1659 + }, + { + "epoch": 1.7566137566137567, + "grad_norm": 0.18321420188469675, + "learning_rate": 2.3049784398275187e-05, + "loss": 0.3687, + "step": 1660 + }, + { + "epoch": 1.7576719576719577, + "grad_norm": 0.24493094649795308, + "learning_rate": 2.3030184241473933e-05, + "loss": 0.4494, + "step": 1661 + }, + { + "epoch": 1.7587301587301587, + "grad_norm": 0.2091468538339814, + "learning_rate": 2.301058408467268e-05, + "loss": 0.4199, + "step": 1662 + }, + { + "epoch": 1.75978835978836, + "grad_norm": 0.2071184685848974, + "learning_rate": 2.2990983927871425e-05, + "loss": 0.4324, + "step": 1663 + }, + { + "epoch": 1.7608465608465609, + "grad_norm": 0.24913461114215388, + "learning_rate": 2.297138377107017e-05, + "loss": 0.467, + "step": 1664 + }, + { + "epoch": 1.7619047619047619, + "grad_norm": 0.2391544344366436, + "learning_rate": 2.2951783614268916e-05, + "loss": 0.4312, + "step": 1665 + }, + { + "epoch": 1.762962962962963, + "grad_norm": 0.22154350725206487, + "learning_rate": 2.293218345746766e-05, + "loss": 0.4597, + "step": 1666 + }, + { + "epoch": 1.764021164021164, + "grad_norm": 0.24501309931712809, + "learning_rate": 2.2912583300666408e-05, + "loss": 0.459, + "step": 1667 + }, + { + "epoch": 1.765079365079365, + "grad_norm": 0.25526129904002803, + "learning_rate": 2.289298314386515e-05, + "loss": 0.4835, + "step": 1668 + }, + { + "epoch": 1.7661375661375662, + "grad_norm": 0.21646297720766336, + "learning_rate": 2.28733829870639e-05, + "loss": 0.4335, + "step": 1669 + }, + { + "epoch": 1.7671957671957672, + "grad_norm": 0.22733884215028632, + "learning_rate": 2.2853782830262642e-05, + "loss": 0.4552, + "step": 1670 + }, + { + "epoch": 1.7682539682539682, + "grad_norm": 0.2075067401096603, + "learning_rate": 2.2834182673461388e-05, + "loss": 0.4138, + "step": 1671 + }, + { + "epoch": 1.7693121693121694, + "grad_norm": 0.2257968183277266, + "learning_rate": 2.2814582516660134e-05, + "loss": 0.5253, + "step": 1672 + }, + { + "epoch": 1.7703703703703704, + "grad_norm": 0.21533345478212504, + "learning_rate": 2.279498235985888e-05, + "loss": 0.4942, + "step": 1673 + }, + { + "epoch": 1.7714285714285714, + "grad_norm": 0.22258782531542667, + "learning_rate": 2.2775382203057626e-05, + "loss": 0.4511, + "step": 1674 + }, + { + "epoch": 1.7724867724867726, + "grad_norm": 0.2323804687576553, + "learning_rate": 2.275578204625637e-05, + "loss": 0.439, + "step": 1675 + }, + { + "epoch": 1.7735449735449735, + "grad_norm": 0.22142223429635433, + "learning_rate": 2.2736181889455118e-05, + "loss": 0.4404, + "step": 1676 + }, + { + "epoch": 1.7746031746031745, + "grad_norm": 0.24472248169409241, + "learning_rate": 2.2716581732653863e-05, + "loss": 0.4276, + "step": 1677 + }, + { + "epoch": 1.7756613756613757, + "grad_norm": 0.49844277498031275, + "learning_rate": 2.269698157585261e-05, + "loss": 0.3956, + "step": 1678 + }, + { + "epoch": 1.7767195767195767, + "grad_norm": 0.228348531852297, + "learning_rate": 2.2677381419051352e-05, + "loss": 0.4511, + "step": 1679 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.2886498501182977, + "learning_rate": 2.26577812622501e-05, + "loss": 0.4456, + "step": 1680 + }, + { + "epoch": 1.7788359788359789, + "grad_norm": 4.492214718550815, + "learning_rate": 2.2638181105448844e-05, + "loss": 0.5011, + "step": 1681 + }, + { + "epoch": 1.7798941798941799, + "grad_norm": 0.24204760220136187, + "learning_rate": 2.2618580948647593e-05, + "loss": 0.4279, + "step": 1682 + }, + { + "epoch": 1.7809523809523808, + "grad_norm": 0.23756281310500446, + "learning_rate": 2.2598980791846335e-05, + "loss": 0.4534, + "step": 1683 + }, + { + "epoch": 1.782010582010582, + "grad_norm": 0.21980783401447584, + "learning_rate": 2.257938063504508e-05, + "loss": 0.4446, + "step": 1684 + }, + { + "epoch": 1.783068783068783, + "grad_norm": 0.2087115413343064, + "learning_rate": 2.2559780478243827e-05, + "loss": 0.3605, + "step": 1685 + }, + { + "epoch": 1.784126984126984, + "grad_norm": 0.2543465932738077, + "learning_rate": 2.2540180321442573e-05, + "loss": 0.4842, + "step": 1686 + }, + { + "epoch": 1.7851851851851852, + "grad_norm": 0.22912923330057322, + "learning_rate": 2.252058016464132e-05, + "loss": 0.486, + "step": 1687 + }, + { + "epoch": 1.7862433862433862, + "grad_norm": 0.21781329156771057, + "learning_rate": 2.2500980007840065e-05, + "loss": 0.4683, + "step": 1688 + }, + { + "epoch": 1.7873015873015872, + "grad_norm": 0.21952765550999895, + "learning_rate": 2.2481379851038807e-05, + "loss": 0.449, + "step": 1689 + }, + { + "epoch": 1.7883597883597884, + "grad_norm": 0.2419599965780607, + "learning_rate": 2.2461779694237556e-05, + "loss": 0.5227, + "step": 1690 + }, + { + "epoch": 1.7894179894179896, + "grad_norm": 0.23359674854150045, + "learning_rate": 2.24421795374363e-05, + "loss": 0.4485, + "step": 1691 + }, + { + "epoch": 1.7904761904761903, + "grad_norm": 0.21983049948624178, + "learning_rate": 2.2422579380635045e-05, + "loss": 0.4553, + "step": 1692 + }, + { + "epoch": 1.7915343915343915, + "grad_norm": 0.22263523405847083, + "learning_rate": 2.240297922383379e-05, + "loss": 0.4627, + "step": 1693 + }, + { + "epoch": 1.7925925925925927, + "grad_norm": 0.21875294678933027, + "learning_rate": 2.2383379067032536e-05, + "loss": 0.452, + "step": 1694 + }, + { + "epoch": 1.7936507936507935, + "grad_norm": 0.2190892904324516, + "learning_rate": 2.2363778910231282e-05, + "loss": 0.4176, + "step": 1695 + }, + { + "epoch": 1.7947089947089947, + "grad_norm": 0.20100867962420854, + "learning_rate": 2.2344178753430028e-05, + "loss": 0.4424, + "step": 1696 + }, + { + "epoch": 1.795767195767196, + "grad_norm": 0.22957554331685712, + "learning_rate": 2.2324578596628774e-05, + "loss": 0.471, + "step": 1697 + }, + { + "epoch": 1.7968253968253967, + "grad_norm": 0.22879161164574066, + "learning_rate": 2.230497843982752e-05, + "loss": 0.497, + "step": 1698 + }, + { + "epoch": 1.7978835978835979, + "grad_norm": 0.19578829621547492, + "learning_rate": 2.2285378283026266e-05, + "loss": 0.3854, + "step": 1699 + }, + { + "epoch": 1.798941798941799, + "grad_norm": 0.21006387291689507, + "learning_rate": 2.2265778126225012e-05, + "loss": 0.4098, + "step": 1700 + }, + { + "epoch": 1.8, + "grad_norm": 0.23977296269604864, + "learning_rate": 2.2246177969423758e-05, + "loss": 0.4905, + "step": 1701 + }, + { + "epoch": 1.801058201058201, + "grad_norm": 0.2105622462139618, + "learning_rate": 2.22265778126225e-05, + "loss": 0.4283, + "step": 1702 + }, + { + "epoch": 1.8021164021164022, + "grad_norm": 0.22056822514598742, + "learning_rate": 2.220697765582125e-05, + "loss": 0.4205, + "step": 1703 + }, + { + "epoch": 1.8031746031746032, + "grad_norm": 0.20582405401017348, + "learning_rate": 2.2187377499019992e-05, + "loss": 0.4471, + "step": 1704 + }, + { + "epoch": 1.8042328042328042, + "grad_norm": 0.24182764269625914, + "learning_rate": 2.216777734221874e-05, + "loss": 0.4553, + "step": 1705 + }, + { + "epoch": 1.8052910052910054, + "grad_norm": 0.20679090700136382, + "learning_rate": 2.2148177185417484e-05, + "loss": 0.451, + "step": 1706 + }, + { + "epoch": 1.8063492063492064, + "grad_norm": 0.19092027693402705, + "learning_rate": 2.212857702861623e-05, + "loss": 0.3698, + "step": 1707 + }, + { + "epoch": 1.8074074074074074, + "grad_norm": 0.23205314819347692, + "learning_rate": 2.2108976871814975e-05, + "loss": 0.4511, + "step": 1708 + }, + { + "epoch": 1.8084656084656086, + "grad_norm": 0.21244419197626072, + "learning_rate": 2.208937671501372e-05, + "loss": 0.426, + "step": 1709 + }, + { + "epoch": 1.8095238095238095, + "grad_norm": 0.22435395018849072, + "learning_rate": 2.2069776558212467e-05, + "loss": 0.4626, + "step": 1710 + }, + { + "epoch": 1.8105820105820105, + "grad_norm": 0.24022212831540976, + "learning_rate": 2.2050176401411213e-05, + "loss": 0.4383, + "step": 1711 + }, + { + "epoch": 1.8116402116402117, + "grad_norm": 0.20705066488075852, + "learning_rate": 2.203057624460996e-05, + "loss": 0.4122, + "step": 1712 + }, + { + "epoch": 1.8126984126984127, + "grad_norm": 0.22592755674231862, + "learning_rate": 2.2010976087808705e-05, + "loss": 0.4842, + "step": 1713 + }, + { + "epoch": 1.8137566137566137, + "grad_norm": 0.21271788549637888, + "learning_rate": 2.199137593100745e-05, + "loss": 0.4389, + "step": 1714 + }, + { + "epoch": 1.8148148148148149, + "grad_norm": 0.23889518702932439, + "learning_rate": 2.1971775774206193e-05, + "loss": 0.4717, + "step": 1715 + }, + { + "epoch": 1.8158730158730159, + "grad_norm": 0.2069283317504962, + "learning_rate": 2.1952175617404942e-05, + "loss": 0.4386, + "step": 1716 + }, + { + "epoch": 1.8169312169312168, + "grad_norm": 0.20579833750263932, + "learning_rate": 2.1932575460603685e-05, + "loss": 0.4147, + "step": 1717 + }, + { + "epoch": 1.817989417989418, + "grad_norm": 0.2249576933304194, + "learning_rate": 2.1912975303802434e-05, + "loss": 0.4128, + "step": 1718 + }, + { + "epoch": 1.819047619047619, + "grad_norm": 0.24258567681504786, + "learning_rate": 2.1893375147001177e-05, + "loss": 0.496, + "step": 1719 + }, + { + "epoch": 1.82010582010582, + "grad_norm": 2.4445652502189414, + "learning_rate": 2.1873774990199922e-05, + "loss": 0.5013, + "step": 1720 + }, + { + "epoch": 1.8211640211640212, + "grad_norm": 0.22937999540510728, + "learning_rate": 2.185417483339867e-05, + "loss": 0.458, + "step": 1721 + }, + { + "epoch": 1.8222222222222222, + "grad_norm": 0.22188766496336873, + "learning_rate": 2.1834574676597414e-05, + "loss": 0.4118, + "step": 1722 + }, + { + "epoch": 1.8232804232804232, + "grad_norm": 0.20985186497122293, + "learning_rate": 2.1814974519796157e-05, + "loss": 0.4202, + "step": 1723 + }, + { + "epoch": 1.8243386243386244, + "grad_norm": 5.620824290236189, + "learning_rate": 2.1795374362994906e-05, + "loss": 0.6836, + "step": 1724 + }, + { + "epoch": 1.8253968253968254, + "grad_norm": 0.2732860325354266, + "learning_rate": 2.177577420619365e-05, + "loss": 0.5006, + "step": 1725 + }, + { + "epoch": 1.8264550264550263, + "grad_norm": 0.2518168942236389, + "learning_rate": 2.1756174049392398e-05, + "loss": 0.4803, + "step": 1726 + }, + { + "epoch": 1.8275132275132275, + "grad_norm": 0.30831471115695636, + "learning_rate": 2.173657389259114e-05, + "loss": 0.4059, + "step": 1727 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 0.2518548837185105, + "learning_rate": 2.1716973735789886e-05, + "loss": 0.4423, + "step": 1728 + }, + { + "epoch": 1.8296296296296295, + "grad_norm": 0.23578369543941782, + "learning_rate": 2.1697373578988632e-05, + "loss": 0.4302, + "step": 1729 + }, + { + "epoch": 1.8306878306878307, + "grad_norm": 0.2417044417728648, + "learning_rate": 2.1677773422187378e-05, + "loss": 0.4478, + "step": 1730 + }, + { + "epoch": 1.831746031746032, + "grad_norm": 0.21978275736190653, + "learning_rate": 2.1658173265386124e-05, + "loss": 0.4316, + "step": 1731 + }, + { + "epoch": 1.8328042328042327, + "grad_norm": 0.22025067994263706, + "learning_rate": 2.163857310858487e-05, + "loss": 0.4013, + "step": 1732 + }, + { + "epoch": 1.8338624338624339, + "grad_norm": 0.23323951970705392, + "learning_rate": 2.1618972951783615e-05, + "loss": 0.4603, + "step": 1733 + }, + { + "epoch": 1.834920634920635, + "grad_norm": 0.2120845002669665, + "learning_rate": 2.159937279498236e-05, + "loss": 0.4597, + "step": 1734 + }, + { + "epoch": 1.8359788359788358, + "grad_norm": 0.22367193469028424, + "learning_rate": 2.1579772638181107e-05, + "loss": 0.4438, + "step": 1735 + }, + { + "epoch": 1.837037037037037, + "grad_norm": 0.21769442259199298, + "learning_rate": 2.1560172481379853e-05, + "loss": 0.4325, + "step": 1736 + }, + { + "epoch": 1.8380952380952382, + "grad_norm": 0.2248745754640526, + "learning_rate": 2.15405723245786e-05, + "loss": 0.432, + "step": 1737 + }, + { + "epoch": 1.8391534391534392, + "grad_norm": 0.21085112873884912, + "learning_rate": 2.152097216777734e-05, + "loss": 0.4293, + "step": 1738 + }, + { + "epoch": 1.8402116402116402, + "grad_norm": 0.20872522105623528, + "learning_rate": 2.150137201097609e-05, + "loss": 0.4546, + "step": 1739 + }, + { + "epoch": 1.8412698412698414, + "grad_norm": 0.2285263541285715, + "learning_rate": 2.1481771854174833e-05, + "loss": 0.4051, + "step": 1740 + }, + { + "epoch": 1.8423280423280424, + "grad_norm": 0.22784803646681845, + "learning_rate": 2.146217169737358e-05, + "loss": 0.446, + "step": 1741 + }, + { + "epoch": 1.8433862433862434, + "grad_norm": 0.26036748975604307, + "learning_rate": 2.1442571540572325e-05, + "loss": 0.4154, + "step": 1742 + }, + { + "epoch": 1.8444444444444446, + "grad_norm": 0.2387042021248444, + "learning_rate": 2.142297138377107e-05, + "loss": 0.4309, + "step": 1743 + }, + { + "epoch": 1.8455026455026455, + "grad_norm": 0.2182840280137568, + "learning_rate": 2.1403371226969817e-05, + "loss": 0.4367, + "step": 1744 + }, + { + "epoch": 1.8465608465608465, + "grad_norm": 0.22907755338440494, + "learning_rate": 2.1383771070168563e-05, + "loss": 0.4255, + "step": 1745 + }, + { + "epoch": 1.8476190476190477, + "grad_norm": 0.20993653159537567, + "learning_rate": 2.136417091336731e-05, + "loss": 0.4389, + "step": 1746 + }, + { + "epoch": 1.8486772486772487, + "grad_norm": 0.21338544578060106, + "learning_rate": 2.1344570756566054e-05, + "loss": 0.461, + "step": 1747 + }, + { + "epoch": 1.8497354497354497, + "grad_norm": 0.21341559351141076, + "learning_rate": 2.13249705997648e-05, + "loss": 0.4839, + "step": 1748 + }, + { + "epoch": 1.8507936507936509, + "grad_norm": 0.24079068460679487, + "learning_rate": 2.1305370442963546e-05, + "loss": 0.4823, + "step": 1749 + }, + { + "epoch": 1.8518518518518519, + "grad_norm": 0.20444170286570748, + "learning_rate": 2.1285770286162292e-05, + "loss": 0.4461, + "step": 1750 + }, + { + "epoch": 1.8529100529100528, + "grad_norm": 0.2387410951821536, + "learning_rate": 2.1266170129361034e-05, + "loss": 0.5042, + "step": 1751 + }, + { + "epoch": 1.853968253968254, + "grad_norm": 0.19937829720047517, + "learning_rate": 2.1246569972559784e-05, + "loss": 0.3733, + "step": 1752 + }, + { + "epoch": 1.855026455026455, + "grad_norm": 0.20349986087488134, + "learning_rate": 2.1226969815758526e-05, + "loss": 0.4192, + "step": 1753 + }, + { + "epoch": 1.856084656084656, + "grad_norm": 0.19942778332550556, + "learning_rate": 2.1207369658957275e-05, + "loss": 0.4123, + "step": 1754 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.2395297045424899, + "learning_rate": 2.1187769502156018e-05, + "loss": 0.4858, + "step": 1755 + }, + { + "epoch": 1.8582010582010582, + "grad_norm": 0.20249034937090846, + "learning_rate": 2.1168169345354764e-05, + "loss": 0.4741, + "step": 1756 + }, + { + "epoch": 1.8592592592592592, + "grad_norm": 0.22048497676461043, + "learning_rate": 2.114856918855351e-05, + "loss": 0.45, + "step": 1757 + }, + { + "epoch": 1.8603174603174604, + "grad_norm": 0.22049360912083787, + "learning_rate": 2.1128969031752256e-05, + "loss": 0.5033, + "step": 1758 + }, + { + "epoch": 1.8613756613756614, + "grad_norm": 0.22074914003644552, + "learning_rate": 2.1109368874950998e-05, + "loss": 0.4399, + "step": 1759 + }, + { + "epoch": 1.8624338624338623, + "grad_norm": 0.2160312595040178, + "learning_rate": 2.1089768718149747e-05, + "loss": 0.4702, + "step": 1760 + }, + { + "epoch": 1.8634920634920635, + "grad_norm": 0.206865371111694, + "learning_rate": 2.107016856134849e-05, + "loss": 0.4652, + "step": 1761 + }, + { + "epoch": 1.8645502645502645, + "grad_norm": 0.2091592072021074, + "learning_rate": 2.105056840454724e-05, + "loss": 0.4182, + "step": 1762 + }, + { + "epoch": 1.8656084656084655, + "grad_norm": 0.23339044723826416, + "learning_rate": 2.103096824774598e-05, + "loss": 0.5277, + "step": 1763 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 0.4460094384942643, + "learning_rate": 2.1011368090944727e-05, + "loss": 0.4579, + "step": 1764 + }, + { + "epoch": 1.8677248677248677, + "grad_norm": 0.2100939143704434, + "learning_rate": 2.0991767934143473e-05, + "loss": 0.4655, + "step": 1765 + }, + { + "epoch": 1.8687830687830687, + "grad_norm": 0.23434130765094305, + "learning_rate": 2.097216777734222e-05, + "loss": 0.5209, + "step": 1766 + }, + { + "epoch": 1.8698412698412699, + "grad_norm": 0.2434629762626319, + "learning_rate": 2.0952567620540965e-05, + "loss": 0.4495, + "step": 1767 + }, + { + "epoch": 1.870899470899471, + "grad_norm": 0.21781894326153597, + "learning_rate": 2.093296746373971e-05, + "loss": 0.4462, + "step": 1768 + }, + { + "epoch": 1.8719576719576718, + "grad_norm": 0.20899498020213622, + "learning_rate": 2.0913367306938457e-05, + "loss": 0.4457, + "step": 1769 + }, + { + "epoch": 1.873015873015873, + "grad_norm": 0.2167051331331649, + "learning_rate": 2.0893767150137203e-05, + "loss": 0.4375, + "step": 1770 + }, + { + "epoch": 1.8740740740740742, + "grad_norm": 0.2240331588887899, + "learning_rate": 2.087416699333595e-05, + "loss": 0.5185, + "step": 1771 + }, + { + "epoch": 1.875132275132275, + "grad_norm": 0.22090206678208632, + "learning_rate": 2.085456683653469e-05, + "loss": 0.4808, + "step": 1772 + }, + { + "epoch": 1.8761904761904762, + "grad_norm": 0.22040696733634577, + "learning_rate": 2.083496667973344e-05, + "loss": 0.456, + "step": 1773 + }, + { + "epoch": 1.8772486772486774, + "grad_norm": 0.2079955864538246, + "learning_rate": 2.0815366522932183e-05, + "loss": 0.4143, + "step": 1774 + }, + { + "epoch": 1.8783068783068781, + "grad_norm": 0.22169752369395546, + "learning_rate": 2.0795766366130932e-05, + "loss": 0.4858, + "step": 1775 + }, + { + "epoch": 1.8793650793650793, + "grad_norm": 0.2402070315956562, + "learning_rate": 2.0776166209329674e-05, + "loss": 0.4504, + "step": 1776 + }, + { + "epoch": 1.8804232804232806, + "grad_norm": 0.23435927645511864, + "learning_rate": 2.075656605252842e-05, + "loss": 0.4593, + "step": 1777 + }, + { + "epoch": 1.8814814814814815, + "grad_norm": 0.20818590974484366, + "learning_rate": 2.0736965895727166e-05, + "loss": 0.4381, + "step": 1778 + }, + { + "epoch": 1.8825396825396825, + "grad_norm": 0.2130354907179979, + "learning_rate": 2.0717365738925912e-05, + "loss": 0.4297, + "step": 1779 + }, + { + "epoch": 1.8835978835978837, + "grad_norm": 0.21660044821589397, + "learning_rate": 2.0697765582124658e-05, + "loss": 0.4697, + "step": 1780 + }, + { + "epoch": 1.8846560846560847, + "grad_norm": 0.20309183557121804, + "learning_rate": 2.0678165425323404e-05, + "loss": 0.4025, + "step": 1781 + }, + { + "epoch": 1.8857142857142857, + "grad_norm": 0.21087610186232195, + "learning_rate": 2.065856526852215e-05, + "loss": 0.4546, + "step": 1782 + }, + { + "epoch": 1.8867724867724869, + "grad_norm": 0.22732457303926343, + "learning_rate": 2.0638965111720896e-05, + "loss": 0.4668, + "step": 1783 + }, + { + "epoch": 1.8878306878306879, + "grad_norm": 0.22180452040864543, + "learning_rate": 2.061936495491964e-05, + "loss": 0.4584, + "step": 1784 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.21261990698971484, + "learning_rate": 2.0599764798118387e-05, + "loss": 0.4406, + "step": 1785 + }, + { + "epoch": 1.88994708994709, + "grad_norm": 0.21071003692457904, + "learning_rate": 2.0580164641317133e-05, + "loss": 0.4143, + "step": 1786 + }, + { + "epoch": 1.891005291005291, + "grad_norm": 0.2204518775713819, + "learning_rate": 2.0560564484515876e-05, + "loss": 0.4135, + "step": 1787 + }, + { + "epoch": 1.892063492063492, + "grad_norm": 0.2078948195319659, + "learning_rate": 2.0540964327714625e-05, + "loss": 0.3921, + "step": 1788 + }, + { + "epoch": 1.8931216931216932, + "grad_norm": 0.2196279086951217, + "learning_rate": 2.0521364170913367e-05, + "loss": 0.3873, + "step": 1789 + }, + { + "epoch": 1.8941798941798942, + "grad_norm": 0.24347249045633615, + "learning_rate": 2.0501764014112113e-05, + "loss": 0.4223, + "step": 1790 + }, + { + "epoch": 1.8952380952380952, + "grad_norm": 0.21502367020868784, + "learning_rate": 2.048216385731086e-05, + "loss": 0.4241, + "step": 1791 + }, + { + "epoch": 1.8962962962962964, + "grad_norm": 0.2223665065572327, + "learning_rate": 2.0462563700509605e-05, + "loss": 0.4585, + "step": 1792 + }, + { + "epoch": 1.8973544973544973, + "grad_norm": 0.2210410067693124, + "learning_rate": 2.044296354370835e-05, + "loss": 0.4468, + "step": 1793 + }, + { + "epoch": 1.8984126984126983, + "grad_norm": 0.21557122963124056, + "learning_rate": 2.0423363386907097e-05, + "loss": 0.458, + "step": 1794 + }, + { + "epoch": 1.8994708994708995, + "grad_norm": 0.24519100116436493, + "learning_rate": 2.040376323010584e-05, + "loss": 0.4466, + "step": 1795 + }, + { + "epoch": 1.9005291005291005, + "grad_norm": 0.24801432016192543, + "learning_rate": 2.038416307330459e-05, + "loss": 0.4839, + "step": 1796 + }, + { + "epoch": 1.9015873015873015, + "grad_norm": 0.20049577359290413, + "learning_rate": 2.036456291650333e-05, + "loss": 0.4127, + "step": 1797 + }, + { + "epoch": 1.9026455026455027, + "grad_norm": 0.23520752147502938, + "learning_rate": 2.034496275970208e-05, + "loss": 0.448, + "step": 1798 + }, + { + "epoch": 1.9037037037037037, + "grad_norm": 0.24490985101151913, + "learning_rate": 2.0325362602900823e-05, + "loss": 0.431, + "step": 1799 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.2664715652435904, + "learning_rate": 2.030576244609957e-05, + "loss": 0.4766, + "step": 1800 + }, + { + "epoch": 1.9058201058201059, + "grad_norm": 0.2485082923716217, + "learning_rate": 2.0286162289298315e-05, + "loss": 0.573, + "step": 1801 + }, + { + "epoch": 1.9068783068783068, + "grad_norm": 0.24251809787369136, + "learning_rate": 2.026656213249706e-05, + "loss": 0.4245, + "step": 1802 + }, + { + "epoch": 1.9079365079365078, + "grad_norm": 0.27387491257447555, + "learning_rate": 2.0246961975695806e-05, + "loss": 0.4171, + "step": 1803 + }, + { + "epoch": 1.908994708994709, + "grad_norm": 0.23047285152697722, + "learning_rate": 2.0227361818894552e-05, + "loss": 0.493, + "step": 1804 + }, + { + "epoch": 1.91005291005291, + "grad_norm": 7.990646854444911, + "learning_rate": 2.0207761662093298e-05, + "loss": 0.635, + "step": 1805 + }, + { + "epoch": 1.911111111111111, + "grad_norm": 0.26163599966519524, + "learning_rate": 2.0188161505292044e-05, + "loss": 0.4351, + "step": 1806 + }, + { + "epoch": 1.9121693121693122, + "grad_norm": 0.25130856546308006, + "learning_rate": 2.016856134849079e-05, + "loss": 0.4399, + "step": 1807 + }, + { + "epoch": 1.9132275132275134, + "grad_norm": 0.20476985393616431, + "learning_rate": 2.0148961191689532e-05, + "loss": 0.4158, + "step": 1808 + }, + { + "epoch": 1.9142857142857141, + "grad_norm": 0.22396937696533817, + "learning_rate": 2.012936103488828e-05, + "loss": 0.4953, + "step": 1809 + }, + { + "epoch": 1.9153439153439153, + "grad_norm": 0.2100685359464842, + "learning_rate": 2.0109760878087024e-05, + "loss": 0.4604, + "step": 1810 + }, + { + "epoch": 1.9164021164021166, + "grad_norm": 0.2347068667100795, + "learning_rate": 2.0090160721285773e-05, + "loss": 0.4656, + "step": 1811 + }, + { + "epoch": 1.9174603174603173, + "grad_norm": 0.2134340622258882, + "learning_rate": 2.0070560564484516e-05, + "loss": 0.4155, + "step": 1812 + }, + { + "epoch": 1.9185185185185185, + "grad_norm": 0.20915306386821994, + "learning_rate": 2.005096040768326e-05, + "loss": 0.4005, + "step": 1813 + }, + { + "epoch": 1.9195767195767197, + "grad_norm": 0.2261359347456008, + "learning_rate": 2.0031360250882008e-05, + "loss": 0.4248, + "step": 1814 + }, + { + "epoch": 1.9206349206349205, + "grad_norm": 0.21813479888223342, + "learning_rate": 2.0011760094080753e-05, + "loss": 0.4638, + "step": 1815 + }, + { + "epoch": 1.9216931216931217, + "grad_norm": 0.2101057981170796, + "learning_rate": 1.99921599372795e-05, + "loss": 0.4028, + "step": 1816 + }, + { + "epoch": 1.9227513227513229, + "grad_norm": 0.22511459770933143, + "learning_rate": 1.9972559780478245e-05, + "loss": 0.5111, + "step": 1817 + }, + { + "epoch": 1.9238095238095239, + "grad_norm": 0.23388911916732105, + "learning_rate": 1.995295962367699e-05, + "loss": 0.4683, + "step": 1818 + }, + { + "epoch": 1.9248677248677248, + "grad_norm": 0.22435740200537527, + "learning_rate": 1.9933359466875737e-05, + "loss": 0.438, + "step": 1819 + }, + { + "epoch": 1.925925925925926, + "grad_norm": 0.21611950819751236, + "learning_rate": 1.9913759310074483e-05, + "loss": 0.378, + "step": 1820 + }, + { + "epoch": 1.926984126984127, + "grad_norm": 0.20291845845078743, + "learning_rate": 1.9894159153273225e-05, + "loss": 0.4277, + "step": 1821 + }, + { + "epoch": 1.928042328042328, + "grad_norm": 0.22847172320277195, + "learning_rate": 1.9874558996471975e-05, + "loss": 0.4674, + "step": 1822 + }, + { + "epoch": 1.9291005291005292, + "grad_norm": 0.23053558937849467, + "learning_rate": 1.9854958839670717e-05, + "loss": 0.4015, + "step": 1823 + }, + { + "epoch": 1.9301587301587302, + "grad_norm": 0.23708550014377422, + "learning_rate": 1.9835358682869466e-05, + "loss": 0.4684, + "step": 1824 + }, + { + "epoch": 1.9312169312169312, + "grad_norm": 0.316817125246121, + "learning_rate": 1.981575852606821e-05, + "loss": 0.4263, + "step": 1825 + }, + { + "epoch": 1.9322751322751324, + "grad_norm": 0.2254829079471954, + "learning_rate": 1.9796158369266955e-05, + "loss": 0.4851, + "step": 1826 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 0.2316587670641986, + "learning_rate": 1.97765582124657e-05, + "loss": 0.4229, + "step": 1827 + }, + { + "epoch": 1.9343915343915343, + "grad_norm": 0.21297157984215134, + "learning_rate": 1.9756958055664446e-05, + "loss": 0.4415, + "step": 1828 + }, + { + "epoch": 1.9354497354497355, + "grad_norm": 0.2612957037749709, + "learning_rate": 1.9737357898863192e-05, + "loss": 0.4589, + "step": 1829 + }, + { + "epoch": 1.9365079365079365, + "grad_norm": 0.2129663181734509, + "learning_rate": 1.9717757742061938e-05, + "loss": 0.4511, + "step": 1830 + }, + { + "epoch": 1.9375661375661375, + "grad_norm": 0.23503195721500833, + "learning_rate": 1.969815758526068e-05, + "loss": 0.4617, + "step": 1831 + }, + { + "epoch": 1.9386243386243387, + "grad_norm": 0.2287134737206407, + "learning_rate": 1.967855742845943e-05, + "loss": 0.471, + "step": 1832 + }, + { + "epoch": 1.9396825396825397, + "grad_norm": 0.21262604276669486, + "learning_rate": 1.9658957271658172e-05, + "loss": 0.47, + "step": 1833 + }, + { + "epoch": 1.9407407407407407, + "grad_norm": 0.220344769052406, + "learning_rate": 1.963935711485692e-05, + "loss": 0.4505, + "step": 1834 + }, + { + "epoch": 1.9417989417989419, + "grad_norm": 0.220152899947779, + "learning_rate": 1.9619756958055664e-05, + "loss": 0.4263, + "step": 1835 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 0.20366436651004108, + "learning_rate": 1.960015680125441e-05, + "loss": 0.3771, + "step": 1836 + }, + { + "epoch": 1.9439153439153438, + "grad_norm": 0.2200894429401924, + "learning_rate": 1.9580556644453156e-05, + "loss": 0.4203, + "step": 1837 + }, + { + "epoch": 1.944973544973545, + "grad_norm": 0.21590610062905347, + "learning_rate": 1.9560956487651902e-05, + "loss": 0.4558, + "step": 1838 + }, + { + "epoch": 1.946031746031746, + "grad_norm": 0.22356270505912842, + "learning_rate": 1.9541356330850648e-05, + "loss": 0.4233, + "step": 1839 + }, + { + "epoch": 1.947089947089947, + "grad_norm": 0.20737807332328057, + "learning_rate": 1.9521756174049394e-05, + "loss": 0.3993, + "step": 1840 + }, + { + "epoch": 1.9481481481481482, + "grad_norm": 0.21342097386972908, + "learning_rate": 1.950215601724814e-05, + "loss": 0.4127, + "step": 1841 + }, + { + "epoch": 1.9492063492063492, + "grad_norm": 0.2031800844818702, + "learning_rate": 1.9482555860446885e-05, + "loss": 0.383, + "step": 1842 + }, + { + "epoch": 1.9502645502645501, + "grad_norm": 0.22744285466424116, + "learning_rate": 1.946295570364563e-05, + "loss": 0.4748, + "step": 1843 + }, + { + "epoch": 1.9513227513227513, + "grad_norm": 0.23141003857099457, + "learning_rate": 1.9443355546844374e-05, + "loss": 0.5051, + "step": 1844 + }, + { + "epoch": 1.9523809523809523, + "grad_norm": 0.2145215316356645, + "learning_rate": 1.9423755390043123e-05, + "loss": 0.4225, + "step": 1845 + }, + { + "epoch": 1.9534391534391533, + "grad_norm": 0.2026905601029007, + "learning_rate": 1.9404155233241865e-05, + "loss": 0.4007, + "step": 1846 + }, + { + "epoch": 1.9544973544973545, + "grad_norm": 0.22999532724274202, + "learning_rate": 1.9384555076440615e-05, + "loss": 0.4617, + "step": 1847 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 0.2082995676826002, + "learning_rate": 1.9364954919639357e-05, + "loss": 0.4278, + "step": 1848 + }, + { + "epoch": 1.9566137566137565, + "grad_norm": 0.22475386311924242, + "learning_rate": 1.9345354762838103e-05, + "loss": 0.4241, + "step": 1849 + }, + { + "epoch": 1.9576719576719577, + "grad_norm": 0.22479359963728507, + "learning_rate": 1.932575460603685e-05, + "loss": 0.4338, + "step": 1850 + }, + { + "epoch": 1.9587301587301589, + "grad_norm": 0.20367965304182695, + "learning_rate": 1.9306154449235595e-05, + "loss": 0.3818, + "step": 1851 + }, + { + "epoch": 1.9597883597883596, + "grad_norm": 0.21527243201486918, + "learning_rate": 1.928655429243434e-05, + "loss": 0.3774, + "step": 1852 + }, + { + "epoch": 1.9608465608465608, + "grad_norm": 0.2249500078225003, + "learning_rate": 1.9266954135633087e-05, + "loss": 0.4967, + "step": 1853 + }, + { + "epoch": 1.961904761904762, + "grad_norm": 0.21710410127581037, + "learning_rate": 1.9247353978831832e-05, + "loss": 0.456, + "step": 1854 + }, + { + "epoch": 1.9629629629629628, + "grad_norm": 0.21285160925599858, + "learning_rate": 1.9227753822030578e-05, + "loss": 0.4345, + "step": 1855 + }, + { + "epoch": 1.964021164021164, + "grad_norm": 0.19869936479793823, + "learning_rate": 1.9208153665229324e-05, + "loss": 0.4267, + "step": 1856 + }, + { + "epoch": 1.9650793650793652, + "grad_norm": 0.20189396892728878, + "learning_rate": 1.9188553508428067e-05, + "loss": 0.4246, + "step": 1857 + }, + { + "epoch": 1.9661375661375662, + "grad_norm": 0.21026152529101638, + "learning_rate": 1.9168953351626816e-05, + "loss": 0.4074, + "step": 1858 + }, + { + "epoch": 1.9671957671957672, + "grad_norm": 0.2143954764423591, + "learning_rate": 1.914935319482556e-05, + "loss": 0.4739, + "step": 1859 + }, + { + "epoch": 1.9682539682539684, + "grad_norm": 0.19350444830703373, + "learning_rate": 1.9129753038024308e-05, + "loss": 0.3873, + "step": 1860 + }, + { + "epoch": 1.9693121693121693, + "grad_norm": 0.19855752381553382, + "learning_rate": 1.911015288122305e-05, + "loss": 0.4127, + "step": 1861 + }, + { + "epoch": 1.9703703703703703, + "grad_norm": 0.21567120927747255, + "learning_rate": 1.9090552724421796e-05, + "loss": 0.4648, + "step": 1862 + }, + { + "epoch": 1.9714285714285715, + "grad_norm": 0.21153824863129872, + "learning_rate": 1.9070952567620542e-05, + "loss": 0.4796, + "step": 1863 + }, + { + "epoch": 1.9724867724867725, + "grad_norm": 0.19320130815422823, + "learning_rate": 1.9051352410819288e-05, + "loss": 0.4275, + "step": 1864 + }, + { + "epoch": 1.9735449735449735, + "grad_norm": 0.19887236706758227, + "learning_rate": 1.9031752254018034e-05, + "loss": 0.4148, + "step": 1865 + }, + { + "epoch": 1.9746031746031747, + "grad_norm": 0.1932406080084438, + "learning_rate": 1.901215209721678e-05, + "loss": 0.4067, + "step": 1866 + }, + { + "epoch": 1.9756613756613757, + "grad_norm": 0.21807163033247312, + "learning_rate": 1.8992551940415522e-05, + "loss": 0.4882, + "step": 1867 + }, + { + "epoch": 1.9767195767195767, + "grad_norm": 0.20618559739939532, + "learning_rate": 1.897295178361427e-05, + "loss": 0.4276, + "step": 1868 + }, + { + "epoch": 1.9777777777777779, + "grad_norm": 0.19090448828455922, + "learning_rate": 1.8953351626813014e-05, + "loss": 0.4043, + "step": 1869 + }, + { + "epoch": 1.9788359788359788, + "grad_norm": 0.1947353451525142, + "learning_rate": 1.893375147001176e-05, + "loss": 0.4039, + "step": 1870 + }, + { + "epoch": 1.9798941798941798, + "grad_norm": 0.22563886214800088, + "learning_rate": 1.8914151313210505e-05, + "loss": 0.4928, + "step": 1871 + }, + { + "epoch": 1.980952380952381, + "grad_norm": 0.24654476642054673, + "learning_rate": 1.889455115640925e-05, + "loss": 0.5016, + "step": 1872 + }, + { + "epoch": 1.982010582010582, + "grad_norm": 0.21770134373827096, + "learning_rate": 1.8874950999607997e-05, + "loss": 0.4636, + "step": 1873 + }, + { + "epoch": 1.983068783068783, + "grad_norm": 0.21078921223109962, + "learning_rate": 1.8855350842806743e-05, + "loss": 0.4529, + "step": 1874 + }, + { + "epoch": 1.9841269841269842, + "grad_norm": 0.2042699274002301, + "learning_rate": 1.883575068600549e-05, + "loss": 0.4443, + "step": 1875 + }, + { + "epoch": 1.9851851851851852, + "grad_norm": 0.20988400394066842, + "learning_rate": 1.8816150529204235e-05, + "loss": 0.4441, + "step": 1876 + }, + { + "epoch": 1.9862433862433861, + "grad_norm": 0.231811845633082, + "learning_rate": 1.879655037240298e-05, + "loss": 0.4538, + "step": 1877 + }, + { + "epoch": 1.9873015873015873, + "grad_norm": 0.19558466017276782, + "learning_rate": 1.8776950215601727e-05, + "loss": 0.4031, + "step": 1878 + }, + { + "epoch": 1.9883597883597883, + "grad_norm": 0.20830804273827055, + "learning_rate": 1.8757350058800472e-05, + "loss": 0.454, + "step": 1879 + }, + { + "epoch": 1.9894179894179893, + "grad_norm": 0.2316902360278231, + "learning_rate": 1.8737749901999215e-05, + "loss": 0.4099, + "step": 1880 + }, + { + "epoch": 1.9904761904761905, + "grad_norm": 0.6294053779165485, + "learning_rate": 1.8718149745197964e-05, + "loss": 0.5239, + "step": 1881 + }, + { + "epoch": 1.9915343915343915, + "grad_norm": 0.1914593214711506, + "learning_rate": 1.8698549588396707e-05, + "loss": 0.3855, + "step": 1882 + }, + { + "epoch": 1.9925925925925925, + "grad_norm": 0.21642178461292894, + "learning_rate": 1.8678949431595456e-05, + "loss": 0.467, + "step": 1883 + }, + { + "epoch": 1.9936507936507937, + "grad_norm": 0.23743786345884751, + "learning_rate": 1.86593492747942e-05, + "loss": 0.4487, + "step": 1884 + }, + { + "epoch": 1.9947089947089947, + "grad_norm": 0.23198695415553083, + "learning_rate": 1.8639749117992944e-05, + "loss": 0.5057, + "step": 1885 + }, + { + "epoch": 1.9957671957671956, + "grad_norm": 0.3471270912766888, + "learning_rate": 1.862014896119169e-05, + "loss": 0.3937, + "step": 1886 + }, + { + "epoch": 1.9968253968253968, + "grad_norm": 0.22576252798533672, + "learning_rate": 1.8600548804390436e-05, + "loss": 0.4228, + "step": 1887 + }, + { + "epoch": 1.997883597883598, + "grad_norm": 0.21560388020282445, + "learning_rate": 1.8580948647589182e-05, + "loss": 0.4147, + "step": 1888 + }, + { + "epoch": 1.9989417989417988, + "grad_norm": 0.22358325111497787, + "learning_rate": 1.8561348490787928e-05, + "loss": 0.4384, + "step": 1889 + }, + { + "epoch": 2.0, + "grad_norm": 0.22079447072147176, + "learning_rate": 1.8541748333986674e-05, + "loss": 0.4229, + "step": 1890 + }, + { + "epoch": 2.001058201058201, + "grad_norm": 0.298496877542824, + "learning_rate": 1.852214817718542e-05, + "loss": 0.3341, + "step": 1891 + }, + { + "epoch": 2.002116402116402, + "grad_norm": 0.2769200047722302, + "learning_rate": 1.8502548020384165e-05, + "loss": 0.3833, + "step": 1892 + }, + { + "epoch": 2.003174603174603, + "grad_norm": 0.29171539792667067, + "learning_rate": 1.8482947863582908e-05, + "loss": 0.3858, + "step": 1893 + }, + { + "epoch": 2.0042328042328044, + "grad_norm": 0.2789134735368325, + "learning_rate": 1.8463347706781657e-05, + "loss": 0.3568, + "step": 1894 + }, + { + "epoch": 2.005291005291005, + "grad_norm": 0.34633898150173376, + "learning_rate": 1.84437475499804e-05, + "loss": 0.3465, + "step": 1895 + }, + { + "epoch": 2.0063492063492063, + "grad_norm": 0.2793278281768595, + "learning_rate": 1.842414739317915e-05, + "loss": 0.4017, + "step": 1896 + }, + { + "epoch": 2.0074074074074075, + "grad_norm": 0.3059555073494133, + "learning_rate": 1.840454723637789e-05, + "loss": 0.3392, + "step": 1897 + }, + { + "epoch": 2.0084656084656083, + "grad_norm": 0.26671660329309566, + "learning_rate": 1.8384947079576637e-05, + "loss": 0.3768, + "step": 1898 + }, + { + "epoch": 2.0095238095238095, + "grad_norm": 0.276118400635007, + "learning_rate": 1.8365346922775383e-05, + "loss": 0.3967, + "step": 1899 + }, + { + "epoch": 2.0105820105820107, + "grad_norm": 0.23862871261186194, + "learning_rate": 1.834574676597413e-05, + "loss": 0.3411, + "step": 1900 + }, + { + "epoch": 2.0116402116402115, + "grad_norm": 0.2602752683930678, + "learning_rate": 1.8326146609172875e-05, + "loss": 0.3598, + "step": 1901 + }, + { + "epoch": 2.0126984126984127, + "grad_norm": 0.24165262446248684, + "learning_rate": 1.830654645237162e-05, + "loss": 0.3637, + "step": 1902 + }, + { + "epoch": 2.013756613756614, + "grad_norm": 0.2211835840328273, + "learning_rate": 1.8286946295570363e-05, + "loss": 0.3475, + "step": 1903 + }, + { + "epoch": 2.0148148148148146, + "grad_norm": 0.2536735392338854, + "learning_rate": 1.8267346138769113e-05, + "loss": 0.371, + "step": 1904 + }, + { + "epoch": 2.015873015873016, + "grad_norm": 0.23225659690495537, + "learning_rate": 1.8247745981967855e-05, + "loss": 0.3231, + "step": 1905 + }, + { + "epoch": 2.016931216931217, + "grad_norm": 0.2363270132100584, + "learning_rate": 1.82281458251666e-05, + "loss": 0.4065, + "step": 1906 + }, + { + "epoch": 2.0179894179894178, + "grad_norm": 0.21698478882185174, + "learning_rate": 1.8208545668365347e-05, + "loss": 0.372, + "step": 1907 + }, + { + "epoch": 2.019047619047619, + "grad_norm": 0.23211712957073785, + "learning_rate": 1.8188945511564093e-05, + "loss": 0.3939, + "step": 1908 + }, + { + "epoch": 2.02010582010582, + "grad_norm": 0.21859722986589825, + "learning_rate": 1.816934535476284e-05, + "loss": 0.3096, + "step": 1909 + }, + { + "epoch": 2.0211640211640214, + "grad_norm": 0.22198129532334243, + "learning_rate": 1.8149745197961584e-05, + "loss": 0.3603, + "step": 1910 + }, + { + "epoch": 2.022222222222222, + "grad_norm": 0.20161388915274267, + "learning_rate": 1.813014504116033e-05, + "loss": 0.3316, + "step": 1911 + }, + { + "epoch": 2.0232804232804233, + "grad_norm": 0.2169686284719689, + "learning_rate": 1.8110544884359076e-05, + "loss": 0.3631, + "step": 1912 + }, + { + "epoch": 2.0243386243386245, + "grad_norm": 0.20992722910943554, + "learning_rate": 1.8090944727557822e-05, + "loss": 0.354, + "step": 1913 + }, + { + "epoch": 2.0253968253968253, + "grad_norm": 0.21467603286816797, + "learning_rate": 1.8071344570756568e-05, + "loss": 0.3362, + "step": 1914 + }, + { + "epoch": 2.0264550264550265, + "grad_norm": 0.20786206596359347, + "learning_rate": 1.8051744413955314e-05, + "loss": 0.3244, + "step": 1915 + }, + { + "epoch": 2.0275132275132277, + "grad_norm": 0.21372256780280968, + "learning_rate": 1.8032144257154056e-05, + "loss": 0.371, + "step": 1916 + }, + { + "epoch": 2.0285714285714285, + "grad_norm": 0.21738963933410124, + "learning_rate": 1.8012544100352806e-05, + "loss": 0.3022, + "step": 1917 + }, + { + "epoch": 2.0296296296296297, + "grad_norm": 0.20703116847143097, + "learning_rate": 1.7992943943551548e-05, + "loss": 0.3333, + "step": 1918 + }, + { + "epoch": 2.030687830687831, + "grad_norm": 0.22313107632965132, + "learning_rate": 1.7973343786750294e-05, + "loss": 0.3615, + "step": 1919 + }, + { + "epoch": 2.0317460317460316, + "grad_norm": 0.20832627765877879, + "learning_rate": 1.795374362994904e-05, + "loss": 0.3038, + "step": 1920 + }, + { + "epoch": 2.032804232804233, + "grad_norm": 0.21323384648245985, + "learning_rate": 1.7934143473147786e-05, + "loss": 0.3509, + "step": 1921 + }, + { + "epoch": 2.033862433862434, + "grad_norm": 0.22323179664908244, + "learning_rate": 1.791454331634653e-05, + "loss": 0.3586, + "step": 1922 + }, + { + "epoch": 2.034920634920635, + "grad_norm": 0.2223456687807534, + "learning_rate": 1.7894943159545277e-05, + "loss": 0.3692, + "step": 1923 + }, + { + "epoch": 2.035978835978836, + "grad_norm": 0.20164634068506698, + "learning_rate": 1.7875343002744023e-05, + "loss": 0.3694, + "step": 1924 + }, + { + "epoch": 2.037037037037037, + "grad_norm": 0.20323586163027332, + "learning_rate": 1.785574284594277e-05, + "loss": 0.3405, + "step": 1925 + }, + { + "epoch": 2.038095238095238, + "grad_norm": 0.19413426270312215, + "learning_rate": 1.7836142689141515e-05, + "loss": 0.3272, + "step": 1926 + }, + { + "epoch": 2.039153439153439, + "grad_norm": 0.21682781764578435, + "learning_rate": 1.781654253234026e-05, + "loss": 0.3543, + "step": 1927 + }, + { + "epoch": 2.0402116402116404, + "grad_norm": 0.20226324872136348, + "learning_rate": 1.7796942375539007e-05, + "loss": 0.3249, + "step": 1928 + }, + { + "epoch": 2.041269841269841, + "grad_norm": 0.22193251820682366, + "learning_rate": 1.777734221873775e-05, + "loss": 0.3416, + "step": 1929 + }, + { + "epoch": 2.0423280423280423, + "grad_norm": 0.1835440881260506, + "learning_rate": 1.77577420619365e-05, + "loss": 0.3253, + "step": 1930 + }, + { + "epoch": 2.0433862433862435, + "grad_norm": 0.23125563586478098, + "learning_rate": 1.773814190513524e-05, + "loss": 0.3172, + "step": 1931 + }, + { + "epoch": 2.0444444444444443, + "grad_norm": 0.2192023103480394, + "learning_rate": 1.771854174833399e-05, + "loss": 0.3675, + "step": 1932 + }, + { + "epoch": 2.0455026455026455, + "grad_norm": 0.19764009006450245, + "learning_rate": 1.7698941591532733e-05, + "loss": 0.3108, + "step": 1933 + }, + { + "epoch": 2.0465608465608467, + "grad_norm": 0.21277420085835824, + "learning_rate": 1.767934143473148e-05, + "loss": 0.3796, + "step": 1934 + }, + { + "epoch": 2.0476190476190474, + "grad_norm": 0.2221209417138681, + "learning_rate": 1.7659741277930225e-05, + "loss": 0.3444, + "step": 1935 + }, + { + "epoch": 2.0486772486772487, + "grad_norm": 0.208598589107057, + "learning_rate": 1.764014112112897e-05, + "loss": 0.3416, + "step": 1936 + }, + { + "epoch": 2.04973544973545, + "grad_norm": 0.19795154801112497, + "learning_rate": 1.7620540964327713e-05, + "loss": 0.295, + "step": 1937 + }, + { + "epoch": 2.0507936507936506, + "grad_norm": 0.21861746828164197, + "learning_rate": 1.7600940807526462e-05, + "loss": 0.3672, + "step": 1938 + }, + { + "epoch": 2.051851851851852, + "grad_norm": 0.20848142512577938, + "learning_rate": 1.7581340650725205e-05, + "loss": 0.3503, + "step": 1939 + }, + { + "epoch": 2.052910052910053, + "grad_norm": 0.20406572548942745, + "learning_rate": 1.7561740493923954e-05, + "loss": 0.3594, + "step": 1940 + }, + { + "epoch": 2.0539682539682538, + "grad_norm": 0.22043618465867792, + "learning_rate": 1.7542140337122696e-05, + "loss": 0.3388, + "step": 1941 + }, + { + "epoch": 2.055026455026455, + "grad_norm": 0.20857631557038028, + "learning_rate": 1.7522540180321442e-05, + "loss": 0.3665, + "step": 1942 + }, + { + "epoch": 2.056084656084656, + "grad_norm": 0.21727483413067578, + "learning_rate": 1.7502940023520188e-05, + "loss": 0.4127, + "step": 1943 + }, + { + "epoch": 2.057142857142857, + "grad_norm": 0.23262861070941543, + "learning_rate": 1.7483339866718934e-05, + "loss": 0.408, + "step": 1944 + }, + { + "epoch": 2.058201058201058, + "grad_norm": 0.19879438803020888, + "learning_rate": 1.746373970991768e-05, + "loss": 0.3254, + "step": 1945 + }, + { + "epoch": 2.0592592592592593, + "grad_norm": 0.21005560252157807, + "learning_rate": 1.7444139553116426e-05, + "loss": 0.3522, + "step": 1946 + }, + { + "epoch": 2.06031746031746, + "grad_norm": 0.22966010387646338, + "learning_rate": 1.742453939631517e-05, + "loss": 0.3749, + "step": 1947 + }, + { + "epoch": 2.0613756613756613, + "grad_norm": 0.19379588380401883, + "learning_rate": 1.7404939239513917e-05, + "loss": 0.3103, + "step": 1948 + }, + { + "epoch": 2.0624338624338625, + "grad_norm": 0.22978352813918101, + "learning_rate": 1.7385339082712663e-05, + "loss": 0.3673, + "step": 1949 + }, + { + "epoch": 2.0634920634920633, + "grad_norm": 0.2212514519753634, + "learning_rate": 1.736573892591141e-05, + "loss": 0.3597, + "step": 1950 + }, + { + "epoch": 2.0645502645502645, + "grad_norm": 0.20997192399458178, + "learning_rate": 1.7346138769110155e-05, + "loss": 0.3736, + "step": 1951 + }, + { + "epoch": 2.0656084656084657, + "grad_norm": 0.20665636238190924, + "learning_rate": 1.7326538612308898e-05, + "loss": 0.3523, + "step": 1952 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 0.20758995846695377, + "learning_rate": 1.7306938455507647e-05, + "loss": 0.3748, + "step": 1953 + }, + { + "epoch": 2.0677248677248676, + "grad_norm": 0.22564678363361657, + "learning_rate": 1.728733829870639e-05, + "loss": 0.336, + "step": 1954 + }, + { + "epoch": 2.068783068783069, + "grad_norm": 0.2516870186148532, + "learning_rate": 1.7267738141905135e-05, + "loss": 0.4016, + "step": 1955 + }, + { + "epoch": 2.06984126984127, + "grad_norm": 0.20252710114468478, + "learning_rate": 1.724813798510388e-05, + "loss": 0.3567, + "step": 1956 + }, + { + "epoch": 2.070899470899471, + "grad_norm": 0.2282832684870184, + "learning_rate": 1.7228537828302627e-05, + "loss": 0.3136, + "step": 1957 + }, + { + "epoch": 2.071957671957672, + "grad_norm": 0.2797927938402852, + "learning_rate": 1.7208937671501373e-05, + "loss": 0.3932, + "step": 1958 + }, + { + "epoch": 2.073015873015873, + "grad_norm": 0.20461018617775642, + "learning_rate": 1.718933751470012e-05, + "loss": 0.3397, + "step": 1959 + }, + { + "epoch": 2.074074074074074, + "grad_norm": 0.21544569146507456, + "learning_rate": 1.7169737357898865e-05, + "loss": 0.3519, + "step": 1960 + }, + { + "epoch": 2.075132275132275, + "grad_norm": 0.22183783680712937, + "learning_rate": 1.715013720109761e-05, + "loss": 0.3515, + "step": 1961 + }, + { + "epoch": 2.0761904761904764, + "grad_norm": 0.22146564807753902, + "learning_rate": 1.7130537044296356e-05, + "loss": 0.3354, + "step": 1962 + }, + { + "epoch": 2.077248677248677, + "grad_norm": 0.19857023572678106, + "learning_rate": 1.7110936887495102e-05, + "loss": 0.3213, + "step": 1963 + }, + { + "epoch": 2.0783068783068783, + "grad_norm": 0.217266771807875, + "learning_rate": 1.7091336730693848e-05, + "loss": 0.3532, + "step": 1964 + }, + { + "epoch": 2.0793650793650795, + "grad_norm": 0.20005495691125305, + "learning_rate": 1.707173657389259e-05, + "loss": 0.3409, + "step": 1965 + }, + { + "epoch": 2.0804232804232803, + "grad_norm": 0.21330150110247303, + "learning_rate": 1.705213641709134e-05, + "loss": 0.3741, + "step": 1966 + }, + { + "epoch": 2.0814814814814815, + "grad_norm": 0.2278780973049878, + "learning_rate": 1.7032536260290082e-05, + "loss": 0.3468, + "step": 1967 + }, + { + "epoch": 2.0825396825396827, + "grad_norm": 0.2109353272677484, + "learning_rate": 1.7012936103488828e-05, + "loss": 0.3471, + "step": 1968 + }, + { + "epoch": 2.0835978835978834, + "grad_norm": 0.19895351419178237, + "learning_rate": 1.6993335946687574e-05, + "loss": 0.3492, + "step": 1969 + }, + { + "epoch": 2.0846560846560847, + "grad_norm": 0.207632263472837, + "learning_rate": 1.697373578988632e-05, + "loss": 0.3399, + "step": 1970 + }, + { + "epoch": 2.085714285714286, + "grad_norm": 0.2163337881929324, + "learning_rate": 1.6954135633085066e-05, + "loss": 0.3255, + "step": 1971 + }, + { + "epoch": 2.0867724867724866, + "grad_norm": 0.23040692314017927, + "learning_rate": 1.6934535476283812e-05, + "loss": 0.3606, + "step": 1972 + }, + { + "epoch": 2.087830687830688, + "grad_norm": 0.21989016356512248, + "learning_rate": 1.6914935319482554e-05, + "loss": 0.3647, + "step": 1973 + }, + { + "epoch": 2.088888888888889, + "grad_norm": 0.19613496455842805, + "learning_rate": 1.6895335162681303e-05, + "loss": 0.3203, + "step": 1974 + }, + { + "epoch": 2.0899470899470898, + "grad_norm": 0.22567442720889747, + "learning_rate": 1.6875735005880046e-05, + "loss": 0.3365, + "step": 1975 + }, + { + "epoch": 2.091005291005291, + "grad_norm": 0.22947949949294724, + "learning_rate": 1.6856134849078795e-05, + "loss": 0.3584, + "step": 1976 + }, + { + "epoch": 2.092063492063492, + "grad_norm": 0.23041866230411318, + "learning_rate": 1.6836534692277538e-05, + "loss": 0.3731, + "step": 1977 + }, + { + "epoch": 2.093121693121693, + "grad_norm": 0.21985790184342138, + "learning_rate": 1.6816934535476284e-05, + "loss": 0.3709, + "step": 1978 + }, + { + "epoch": 2.094179894179894, + "grad_norm": 0.19774950421675602, + "learning_rate": 1.679733437867503e-05, + "loss": 0.3409, + "step": 1979 + }, + { + "epoch": 2.0952380952380953, + "grad_norm": 0.2269563084196834, + "learning_rate": 1.6777734221873775e-05, + "loss": 0.3242, + "step": 1980 + }, + { + "epoch": 2.096296296296296, + "grad_norm": 0.23498422047692288, + "learning_rate": 1.675813406507252e-05, + "loss": 0.3583, + "step": 1981 + }, + { + "epoch": 2.0973544973544973, + "grad_norm": 0.20236703887981325, + "learning_rate": 1.6738533908271267e-05, + "loss": 0.3557, + "step": 1982 + }, + { + "epoch": 2.0984126984126985, + "grad_norm": 0.20883374092288098, + "learning_rate": 1.6718933751470013e-05, + "loss": 0.3285, + "step": 1983 + }, + { + "epoch": 2.0994708994708993, + "grad_norm": 0.2366397155580664, + "learning_rate": 1.669933359466876e-05, + "loss": 0.3539, + "step": 1984 + }, + { + "epoch": 2.1005291005291005, + "grad_norm": 0.2178711255251422, + "learning_rate": 1.6679733437867505e-05, + "loss": 0.329, + "step": 1985 + }, + { + "epoch": 2.1015873015873017, + "grad_norm": 0.21131026834720637, + "learning_rate": 1.6660133281066247e-05, + "loss": 0.3361, + "step": 1986 + }, + { + "epoch": 2.102645502645503, + "grad_norm": 0.2055578261425351, + "learning_rate": 1.6640533124264996e-05, + "loss": 0.3032, + "step": 1987 + }, + { + "epoch": 2.1037037037037036, + "grad_norm": 0.21812782402014477, + "learning_rate": 1.662093296746374e-05, + "loss": 0.296, + "step": 1988 + }, + { + "epoch": 2.104761904761905, + "grad_norm": 0.21287530507237762, + "learning_rate": 1.6601332810662488e-05, + "loss": 0.351, + "step": 1989 + }, + { + "epoch": 2.105820105820106, + "grad_norm": 0.20950973871134726, + "learning_rate": 1.658173265386123e-05, + "loss": 0.3575, + "step": 1990 + }, + { + "epoch": 2.106878306878307, + "grad_norm": 0.21396955753725935, + "learning_rate": 1.6562132497059977e-05, + "loss": 0.3424, + "step": 1991 + }, + { + "epoch": 2.107936507936508, + "grad_norm": 0.22616878855251082, + "learning_rate": 1.6542532340258722e-05, + "loss": 0.3662, + "step": 1992 + }, + { + "epoch": 2.108994708994709, + "grad_norm": 0.20008820458876664, + "learning_rate": 1.6522932183457468e-05, + "loss": 0.3108, + "step": 1993 + }, + { + "epoch": 2.11005291005291, + "grad_norm": 4.143863598055369, + "learning_rate": 1.6503332026656214e-05, + "loss": 0.4833, + "step": 1994 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.2292235575997881, + "learning_rate": 1.648373186985496e-05, + "loss": 0.3343, + "step": 1995 + }, + { + "epoch": 2.1121693121693124, + "grad_norm": 0.23419107839319792, + "learning_rate": 1.6464131713053706e-05, + "loss": 0.3347, + "step": 1996 + }, + { + "epoch": 2.113227513227513, + "grad_norm": 0.22940581819547085, + "learning_rate": 1.6444531556252452e-05, + "loss": 0.3951, + "step": 1997 + }, + { + "epoch": 2.1142857142857143, + "grad_norm": 0.23526414274011145, + "learning_rate": 1.6424931399451198e-05, + "loss": 0.3424, + "step": 1998 + }, + { + "epoch": 2.1153439153439155, + "grad_norm": 0.22735375376942824, + "learning_rate": 1.6405331242649944e-05, + "loss": 0.3649, + "step": 1999 + }, + { + "epoch": 2.1164021164021163, + "grad_norm": 0.21944975202575878, + "learning_rate": 1.638573108584869e-05, + "loss": 0.3376, + "step": 2000 + }, + { + "epoch": 2.1174603174603175, + "grad_norm": 0.5029143453384859, + "learning_rate": 1.6366130929047432e-05, + "loss": 0.3368, + "step": 2001 + }, + { + "epoch": 2.1185185185185187, + "grad_norm": 0.2504647540949935, + "learning_rate": 1.634653077224618e-05, + "loss": 0.365, + "step": 2002 + }, + { + "epoch": 2.1195767195767194, + "grad_norm": 0.2468901550955016, + "learning_rate": 1.6326930615444924e-05, + "loss": 0.3448, + "step": 2003 + }, + { + "epoch": 2.1206349206349207, + "grad_norm": 0.2197505442465071, + "learning_rate": 1.630733045864367e-05, + "loss": 0.361, + "step": 2004 + }, + { + "epoch": 2.121693121693122, + "grad_norm": 0.25391417814496214, + "learning_rate": 1.6287730301842415e-05, + "loss": 0.3581, + "step": 2005 + }, + { + "epoch": 2.1227513227513226, + "grad_norm": 0.25883936021880843, + "learning_rate": 1.626813014504116e-05, + "loss": 0.3505, + "step": 2006 + }, + { + "epoch": 2.123809523809524, + "grad_norm": 0.2520258145131729, + "learning_rate": 1.6248529988239907e-05, + "loss": 0.3398, + "step": 2007 + }, + { + "epoch": 2.124867724867725, + "grad_norm": 0.2203661316211868, + "learning_rate": 1.6228929831438653e-05, + "loss": 0.3627, + "step": 2008 + }, + { + "epoch": 2.1259259259259258, + "grad_norm": 0.23435328040308137, + "learning_rate": 1.6209329674637396e-05, + "loss": 0.3389, + "step": 2009 + }, + { + "epoch": 2.126984126984127, + "grad_norm": 0.27743951042442594, + "learning_rate": 1.6189729517836145e-05, + "loss": 0.3761, + "step": 2010 + }, + { + "epoch": 2.128042328042328, + "grad_norm": 0.20244205174436367, + "learning_rate": 1.6170129361034887e-05, + "loss": 0.3152, + "step": 2011 + }, + { + "epoch": 2.129100529100529, + "grad_norm": 0.20894323608250834, + "learning_rate": 1.6150529204233637e-05, + "loss": 0.3854, + "step": 2012 + }, + { + "epoch": 2.13015873015873, + "grad_norm": 0.24428589140845183, + "learning_rate": 1.613092904743238e-05, + "loss": 0.3876, + "step": 2013 + }, + { + "epoch": 2.1312169312169313, + "grad_norm": 0.24322774885752654, + "learning_rate": 1.6111328890631125e-05, + "loss": 0.3581, + "step": 2014 + }, + { + "epoch": 2.132275132275132, + "grad_norm": 0.20361725365232156, + "learning_rate": 1.609172873382987e-05, + "loss": 0.288, + "step": 2015 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 0.23886166021463448, + "learning_rate": 1.6072128577028617e-05, + "loss": 0.3867, + "step": 2016 + }, + { + "epoch": 2.1343915343915345, + "grad_norm": 0.20682128386588214, + "learning_rate": 1.6052528420227363e-05, + "loss": 0.3289, + "step": 2017 + }, + { + "epoch": 2.1354497354497353, + "grad_norm": 0.22555396551236676, + "learning_rate": 1.603292826342611e-05, + "loss": 0.336, + "step": 2018 + }, + { + "epoch": 2.1365079365079365, + "grad_norm": 0.22141671607674948, + "learning_rate": 1.6013328106624854e-05, + "loss": 0.3584, + "step": 2019 + }, + { + "epoch": 2.1375661375661377, + "grad_norm": 0.21896453846877967, + "learning_rate": 1.59937279498236e-05, + "loss": 0.3527, + "step": 2020 + }, + { + "epoch": 2.1386243386243384, + "grad_norm": 0.22228976521690386, + "learning_rate": 1.5974127793022346e-05, + "loss": 0.3923, + "step": 2021 + }, + { + "epoch": 2.1396825396825396, + "grad_norm": 0.22066786689884066, + "learning_rate": 1.595452763622109e-05, + "loss": 0.3625, + "step": 2022 + }, + { + "epoch": 2.140740740740741, + "grad_norm": 0.22383398867863769, + "learning_rate": 1.5934927479419838e-05, + "loss": 0.3741, + "step": 2023 + }, + { + "epoch": 2.1417989417989416, + "grad_norm": 0.21926212085215122, + "learning_rate": 1.591532732261858e-05, + "loss": 0.305, + "step": 2024 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.21328408231111298, + "learning_rate": 1.589572716581733e-05, + "loss": 0.3539, + "step": 2025 + }, + { + "epoch": 2.143915343915344, + "grad_norm": 0.20310210789488775, + "learning_rate": 1.5876127009016072e-05, + "loss": 0.3406, + "step": 2026 + }, + { + "epoch": 2.1449735449735448, + "grad_norm": 0.20985690382441458, + "learning_rate": 1.5856526852214818e-05, + "loss": 0.3298, + "step": 2027 + }, + { + "epoch": 2.146031746031746, + "grad_norm": 0.21426037653049163, + "learning_rate": 1.5836926695413564e-05, + "loss": 0.3076, + "step": 2028 + }, + { + "epoch": 2.147089947089947, + "grad_norm": 0.21677917505372724, + "learning_rate": 1.581732653861231e-05, + "loss": 0.3824, + "step": 2029 + }, + { + "epoch": 2.148148148148148, + "grad_norm": 0.2064869233207918, + "learning_rate": 1.5797726381811055e-05, + "loss": 0.3436, + "step": 2030 + }, + { + "epoch": 2.149206349206349, + "grad_norm": 0.21971899343914275, + "learning_rate": 1.57781262250098e-05, + "loss": 0.369, + "step": 2031 + }, + { + "epoch": 2.1502645502645503, + "grad_norm": 0.3221845152874034, + "learning_rate": 1.5758526068208547e-05, + "loss": 0.3425, + "step": 2032 + }, + { + "epoch": 2.1513227513227515, + "grad_norm": 0.19874490771824377, + "learning_rate": 1.5738925911407293e-05, + "loss": 0.3184, + "step": 2033 + }, + { + "epoch": 2.1523809523809523, + "grad_norm": 10.905047393025432, + "learning_rate": 1.571932575460604e-05, + "loss": 1.0908, + "step": 2034 + }, + { + "epoch": 2.1534391534391535, + "grad_norm": 0.21735676542498997, + "learning_rate": 1.569972559780478e-05, + "loss": 0.3473, + "step": 2035 + }, + { + "epoch": 2.1544973544973547, + "grad_norm": 0.216545692844694, + "learning_rate": 1.568012544100353e-05, + "loss": 0.3582, + "step": 2036 + }, + { + "epoch": 2.1555555555555554, + "grad_norm": 0.21730583612765947, + "learning_rate": 1.5660525284202273e-05, + "loss": 0.3126, + "step": 2037 + }, + { + "epoch": 2.1566137566137566, + "grad_norm": 0.2004424058198479, + "learning_rate": 1.5640925127401022e-05, + "loss": 0.34, + "step": 2038 + }, + { + "epoch": 2.157671957671958, + "grad_norm": 0.2118306305932416, + "learning_rate": 1.5621324970599765e-05, + "loss": 0.307, + "step": 2039 + }, + { + "epoch": 2.1587301587301586, + "grad_norm": 0.22105484159810915, + "learning_rate": 1.560172481379851e-05, + "loss": 0.3747, + "step": 2040 + }, + { + "epoch": 2.15978835978836, + "grad_norm": 0.20960788510999773, + "learning_rate": 1.5582124656997257e-05, + "loss": 0.3417, + "step": 2041 + }, + { + "epoch": 2.160846560846561, + "grad_norm": 0.21884953306007163, + "learning_rate": 1.5562524500196003e-05, + "loss": 0.3512, + "step": 2042 + }, + { + "epoch": 2.1619047619047618, + "grad_norm": 0.23243491580846684, + "learning_rate": 1.554292434339475e-05, + "loss": 0.3841, + "step": 2043 + }, + { + "epoch": 2.162962962962963, + "grad_norm": 0.23140426219629298, + "learning_rate": 1.5523324186593494e-05, + "loss": 0.405, + "step": 2044 + }, + { + "epoch": 2.164021164021164, + "grad_norm": 0.20298786257942408, + "learning_rate": 1.5503724029792237e-05, + "loss": 0.3551, + "step": 2045 + }, + { + "epoch": 2.165079365079365, + "grad_norm": 0.21348196250544565, + "learning_rate": 1.5484123872990986e-05, + "loss": 0.3378, + "step": 2046 + }, + { + "epoch": 2.166137566137566, + "grad_norm": 0.2439977018458661, + "learning_rate": 1.546452371618973e-05, + "loss": 0.3773, + "step": 2047 + }, + { + "epoch": 2.1671957671957673, + "grad_norm": 0.19895306399691473, + "learning_rate": 1.5444923559388474e-05, + "loss": 0.3209, + "step": 2048 + }, + { + "epoch": 2.168253968253968, + "grad_norm": 0.21695598379114167, + "learning_rate": 1.542532340258722e-05, + "loss": 0.3675, + "step": 2049 + }, + { + "epoch": 2.1693121693121693, + "grad_norm": 0.1794392725994556, + "learning_rate": 1.5405723245785966e-05, + "loss": 0.2931, + "step": 2050 + }, + { + "epoch": 2.1703703703703705, + "grad_norm": 0.2038700225689888, + "learning_rate": 1.5386123088984712e-05, + "loss": 0.3375, + "step": 2051 + }, + { + "epoch": 2.1714285714285713, + "grad_norm": 0.22597709606412955, + "learning_rate": 1.5366522932183458e-05, + "loss": 0.3576, + "step": 2052 + }, + { + "epoch": 2.1724867724867725, + "grad_norm": 0.2077246480006038, + "learning_rate": 1.5346922775382204e-05, + "loss": 0.3344, + "step": 2053 + }, + { + "epoch": 2.1735449735449737, + "grad_norm": 0.20000941756704047, + "learning_rate": 1.532732261858095e-05, + "loss": 0.3382, + "step": 2054 + }, + { + "epoch": 2.1746031746031744, + "grad_norm": 0.19713016775489853, + "learning_rate": 1.5307722461779696e-05, + "loss": 0.3333, + "step": 2055 + }, + { + "epoch": 2.1756613756613756, + "grad_norm": 0.19246485509834052, + "learning_rate": 1.528812230497844e-05, + "loss": 0.305, + "step": 2056 + }, + { + "epoch": 2.176719576719577, + "grad_norm": 0.20627954988108957, + "learning_rate": 1.5268522148177187e-05, + "loss": 0.3102, + "step": 2057 + }, + { + "epoch": 2.1777777777777776, + "grad_norm": 0.21828606128129674, + "learning_rate": 1.5248921991375931e-05, + "loss": 0.38, + "step": 2058 + }, + { + "epoch": 2.178835978835979, + "grad_norm": 0.20648370010623027, + "learning_rate": 1.5229321834574677e-05, + "loss": 0.3356, + "step": 2059 + }, + { + "epoch": 2.17989417989418, + "grad_norm": 0.21130639938010654, + "learning_rate": 1.5209721677773423e-05, + "loss": 0.3638, + "step": 2060 + }, + { + "epoch": 2.1809523809523808, + "grad_norm": 0.2115670503852573, + "learning_rate": 1.5190121520972169e-05, + "loss": 0.3432, + "step": 2061 + }, + { + "epoch": 2.182010582010582, + "grad_norm": 0.1908436166806636, + "learning_rate": 1.5170521364170915e-05, + "loss": 0.3041, + "step": 2062 + }, + { + "epoch": 2.183068783068783, + "grad_norm": 0.2020564072365534, + "learning_rate": 1.515092120736966e-05, + "loss": 0.336, + "step": 2063 + }, + { + "epoch": 2.1841269841269844, + "grad_norm": 0.2161693720721347, + "learning_rate": 1.5131321050568407e-05, + "loss": 0.3545, + "step": 2064 + }, + { + "epoch": 2.185185185185185, + "grad_norm": 0.22473602950532073, + "learning_rate": 1.5111720893767151e-05, + "loss": 0.342, + "step": 2065 + }, + { + "epoch": 2.1862433862433863, + "grad_norm": 0.2150375377284277, + "learning_rate": 1.5092120736965895e-05, + "loss": 0.3833, + "step": 2066 + }, + { + "epoch": 2.1873015873015875, + "grad_norm": 0.21870865973475231, + "learning_rate": 1.5072520580164643e-05, + "loss": 0.3534, + "step": 2067 + }, + { + "epoch": 2.1883597883597883, + "grad_norm": 0.21446734495632633, + "learning_rate": 1.5052920423363387e-05, + "loss": 0.3606, + "step": 2068 + }, + { + "epoch": 2.1894179894179895, + "grad_norm": 0.22888453880818366, + "learning_rate": 1.5033320266562134e-05, + "loss": 0.3907, + "step": 2069 + }, + { + "epoch": 2.1904761904761907, + "grad_norm": 0.21937778969235877, + "learning_rate": 1.5013720109760879e-05, + "loss": 0.3293, + "step": 2070 + }, + { + "epoch": 2.1915343915343914, + "grad_norm": 0.20058436964581325, + "learning_rate": 1.4994119952959623e-05, + "loss": 0.3137, + "step": 2071 + }, + { + "epoch": 2.1925925925925926, + "grad_norm": 0.23325324698372354, + "learning_rate": 1.497451979615837e-05, + "loss": 0.3512, + "step": 2072 + }, + { + "epoch": 2.193650793650794, + "grad_norm": 0.21063385449656516, + "learning_rate": 1.4954919639357115e-05, + "loss": 0.3487, + "step": 2073 + }, + { + "epoch": 2.1947089947089946, + "grad_norm": 0.22182976034778512, + "learning_rate": 1.4935319482555862e-05, + "loss": 0.3783, + "step": 2074 + }, + { + "epoch": 2.195767195767196, + "grad_norm": 0.20256863181044094, + "learning_rate": 1.4915719325754606e-05, + "loss": 0.3231, + "step": 2075 + }, + { + "epoch": 2.196825396825397, + "grad_norm": 0.23952493244674197, + "learning_rate": 1.4896119168953352e-05, + "loss": 0.3828, + "step": 2076 + }, + { + "epoch": 2.1978835978835978, + "grad_norm": 0.2161109491007153, + "learning_rate": 1.4876519012152098e-05, + "loss": 0.3379, + "step": 2077 + }, + { + "epoch": 2.198941798941799, + "grad_norm": 0.21208428593234505, + "learning_rate": 1.4856918855350844e-05, + "loss": 0.3386, + "step": 2078 + }, + { + "epoch": 2.2, + "grad_norm": 0.20469783702775474, + "learning_rate": 1.483731869854959e-05, + "loss": 0.3285, + "step": 2079 + }, + { + "epoch": 2.201058201058201, + "grad_norm": 0.20106413317496333, + "learning_rate": 1.4817718541748336e-05, + "loss": 0.3191, + "step": 2080 + }, + { + "epoch": 2.202116402116402, + "grad_norm": 0.20886186796806508, + "learning_rate": 1.479811838494708e-05, + "loss": 0.3638, + "step": 2081 + }, + { + "epoch": 2.2031746031746033, + "grad_norm": 0.2024596948339384, + "learning_rate": 1.4778518228145827e-05, + "loss": 0.3497, + "step": 2082 + }, + { + "epoch": 2.204232804232804, + "grad_norm": 0.19966325145216773, + "learning_rate": 1.4758918071344572e-05, + "loss": 0.3598, + "step": 2083 + }, + { + "epoch": 2.2052910052910053, + "grad_norm": 0.19469952860860135, + "learning_rate": 1.4739317914543316e-05, + "loss": 0.3035, + "step": 2084 + }, + { + "epoch": 2.2063492063492065, + "grad_norm": 0.21170064193002702, + "learning_rate": 1.4719717757742063e-05, + "loss": 0.304, + "step": 2085 + }, + { + "epoch": 2.2074074074074073, + "grad_norm": 0.20805361694025837, + "learning_rate": 1.4700117600940808e-05, + "loss": 0.3511, + "step": 2086 + }, + { + "epoch": 2.2084656084656085, + "grad_norm": 0.2055245437744949, + "learning_rate": 1.4680517444139555e-05, + "loss": 0.3544, + "step": 2087 + }, + { + "epoch": 2.2095238095238097, + "grad_norm": 0.201237038494924, + "learning_rate": 1.46609172873383e-05, + "loss": 0.3253, + "step": 2088 + }, + { + "epoch": 2.2105820105820104, + "grad_norm": 0.22354747868539224, + "learning_rate": 1.4641317130537043e-05, + "loss": 0.3795, + "step": 2089 + }, + { + "epoch": 2.2116402116402116, + "grad_norm": 0.22190512018803507, + "learning_rate": 1.4621716973735791e-05, + "loss": 0.3626, + "step": 2090 + }, + { + "epoch": 2.212698412698413, + "grad_norm": 0.3024061933530375, + "learning_rate": 1.4602116816934535e-05, + "loss": 0.3426, + "step": 2091 + }, + { + "epoch": 2.2137566137566136, + "grad_norm": 0.22362434974122583, + "learning_rate": 1.4582516660133283e-05, + "loss": 0.3568, + "step": 2092 + }, + { + "epoch": 2.214814814814815, + "grad_norm": 0.2120000539713426, + "learning_rate": 1.4562916503332027e-05, + "loss": 0.3436, + "step": 2093 + }, + { + "epoch": 2.215873015873016, + "grad_norm": 0.2273394265772914, + "learning_rate": 1.4543316346530773e-05, + "loss": 0.347, + "step": 2094 + }, + { + "epoch": 2.2169312169312168, + "grad_norm": 0.20044541135515623, + "learning_rate": 1.4523716189729519e-05, + "loss": 0.3415, + "step": 2095 + }, + { + "epoch": 2.217989417989418, + "grad_norm": 0.20384591070530153, + "learning_rate": 1.4504116032928265e-05, + "loss": 0.3278, + "step": 2096 + }, + { + "epoch": 2.219047619047619, + "grad_norm": 0.25810597441442096, + "learning_rate": 1.4484515876127009e-05, + "loss": 0.3534, + "step": 2097 + }, + { + "epoch": 2.22010582010582, + "grad_norm": 0.22795352736979752, + "learning_rate": 1.4464915719325756e-05, + "loss": 0.3455, + "step": 2098 + }, + { + "epoch": 2.221164021164021, + "grad_norm": 0.21049254714677118, + "learning_rate": 1.44453155625245e-05, + "loss": 0.3808, + "step": 2099 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.23067771453190306, + "learning_rate": 1.4425715405723248e-05, + "loss": 0.3285, + "step": 2100 + }, + { + "epoch": 2.223280423280423, + "grad_norm": 0.2200823890048532, + "learning_rate": 1.4406115248921992e-05, + "loss": 0.3497, + "step": 2101 + }, + { + "epoch": 2.2243386243386243, + "grad_norm": 3.4221269378218153, + "learning_rate": 1.4386515092120736e-05, + "loss": 0.4657, + "step": 2102 + }, + { + "epoch": 2.2253968253968255, + "grad_norm": 0.23261505421987935, + "learning_rate": 1.4366914935319484e-05, + "loss": 0.3376, + "step": 2103 + }, + { + "epoch": 2.2264550264550262, + "grad_norm": 0.20182994591620118, + "learning_rate": 1.4347314778518228e-05, + "loss": 0.3526, + "step": 2104 + }, + { + "epoch": 2.2275132275132274, + "grad_norm": 0.21082285581411647, + "learning_rate": 1.4327714621716976e-05, + "loss": 0.3552, + "step": 2105 + }, + { + "epoch": 2.2285714285714286, + "grad_norm": 0.22032489334949037, + "learning_rate": 1.430811446491572e-05, + "loss": 0.3756, + "step": 2106 + }, + { + "epoch": 2.2296296296296294, + "grad_norm": 0.2159520496931439, + "learning_rate": 1.4288514308114464e-05, + "loss": 0.3592, + "step": 2107 + }, + { + "epoch": 2.2306878306878306, + "grad_norm": 0.20689680272282845, + "learning_rate": 1.4268914151313212e-05, + "loss": 0.3491, + "step": 2108 + }, + { + "epoch": 2.231746031746032, + "grad_norm": 0.19169011186080528, + "learning_rate": 1.4249313994511956e-05, + "loss": 0.3355, + "step": 2109 + }, + { + "epoch": 2.2328042328042326, + "grad_norm": 0.19803274438625507, + "learning_rate": 1.4229713837710703e-05, + "loss": 0.3211, + "step": 2110 + }, + { + "epoch": 2.2338624338624338, + "grad_norm": 0.2169935695452419, + "learning_rate": 1.4210113680909448e-05, + "loss": 0.3789, + "step": 2111 + }, + { + "epoch": 2.234920634920635, + "grad_norm": 0.20431499248719218, + "learning_rate": 1.4190513524108193e-05, + "loss": 0.3162, + "step": 2112 + }, + { + "epoch": 2.235978835978836, + "grad_norm": 0.2079250549519578, + "learning_rate": 1.417091336730694e-05, + "loss": 0.3469, + "step": 2113 + }, + { + "epoch": 2.237037037037037, + "grad_norm": 0.22941584714468002, + "learning_rate": 1.4151313210505685e-05, + "loss": 0.405, + "step": 2114 + }, + { + "epoch": 2.238095238095238, + "grad_norm": 0.19630828002495715, + "learning_rate": 1.413171305370443e-05, + "loss": 0.3515, + "step": 2115 + }, + { + "epoch": 2.2391534391534393, + "grad_norm": 0.210404649350759, + "learning_rate": 1.4112112896903177e-05, + "loss": 0.3376, + "step": 2116 + }, + { + "epoch": 2.24021164021164, + "grad_norm": 0.2376328983905229, + "learning_rate": 1.4092512740101921e-05, + "loss": 0.388, + "step": 2117 + }, + { + "epoch": 2.2412698412698413, + "grad_norm": 0.21383169199679727, + "learning_rate": 1.4072912583300669e-05, + "loss": 0.3199, + "step": 2118 + }, + { + "epoch": 2.2423280423280425, + "grad_norm": 0.20202389531154502, + "learning_rate": 1.4053312426499413e-05, + "loss": 0.3684, + "step": 2119 + }, + { + "epoch": 2.2433862433862433, + "grad_norm": 0.32250759227392084, + "learning_rate": 1.4033712269698157e-05, + "loss": 0.3633, + "step": 2120 + }, + { + "epoch": 2.2444444444444445, + "grad_norm": 0.23290731003562973, + "learning_rate": 1.4014112112896905e-05, + "loss": 0.4027, + "step": 2121 + }, + { + "epoch": 2.2455026455026457, + "grad_norm": 0.2116123949601418, + "learning_rate": 1.3994511956095649e-05, + "loss": 0.3819, + "step": 2122 + }, + { + "epoch": 2.2465608465608464, + "grad_norm": 0.21423684536621376, + "learning_rate": 1.3974911799294396e-05, + "loss": 0.3684, + "step": 2123 + }, + { + "epoch": 2.2476190476190476, + "grad_norm": 0.2052592452761068, + "learning_rate": 1.395531164249314e-05, + "loss": 0.3344, + "step": 2124 + }, + { + "epoch": 2.248677248677249, + "grad_norm": 0.20079823755779902, + "learning_rate": 1.3935711485691885e-05, + "loss": 0.3618, + "step": 2125 + }, + { + "epoch": 2.2497354497354496, + "grad_norm": 1.0894959314523671, + "learning_rate": 1.3916111328890632e-05, + "loss": 0.3475, + "step": 2126 + }, + { + "epoch": 2.250793650793651, + "grad_norm": 0.2099112875651436, + "learning_rate": 1.3896511172089377e-05, + "loss": 0.3252, + "step": 2127 + }, + { + "epoch": 2.251851851851852, + "grad_norm": 0.20136693529793254, + "learning_rate": 1.3876911015288124e-05, + "loss": 0.3378, + "step": 2128 + }, + { + "epoch": 2.2529100529100528, + "grad_norm": 0.2105920879605314, + "learning_rate": 1.3857310858486868e-05, + "loss": 0.3455, + "step": 2129 + }, + { + "epoch": 2.253968253968254, + "grad_norm": 0.21728105108056908, + "learning_rate": 1.3837710701685614e-05, + "loss": 0.3555, + "step": 2130 + }, + { + "epoch": 2.255026455026455, + "grad_norm": 0.19483145128816542, + "learning_rate": 1.381811054488436e-05, + "loss": 0.3163, + "step": 2131 + }, + { + "epoch": 2.256084656084656, + "grad_norm": 0.18761723281777778, + "learning_rate": 1.3798510388083106e-05, + "loss": 0.3057, + "step": 2132 + }, + { + "epoch": 2.257142857142857, + "grad_norm": 0.2031218040229401, + "learning_rate": 1.377891023128185e-05, + "loss": 0.3118, + "step": 2133 + }, + { + "epoch": 2.2582010582010583, + "grad_norm": 0.19461075746598536, + "learning_rate": 1.3759310074480598e-05, + "loss": 0.3307, + "step": 2134 + }, + { + "epoch": 2.259259259259259, + "grad_norm": 0.23054743512215764, + "learning_rate": 1.3739709917679342e-05, + "loss": 0.4172, + "step": 2135 + }, + { + "epoch": 2.2603174603174603, + "grad_norm": 0.21187621751878205, + "learning_rate": 1.372010976087809e-05, + "loss": 0.3666, + "step": 2136 + }, + { + "epoch": 2.2613756613756615, + "grad_norm": 0.22640129012026444, + "learning_rate": 1.3700509604076834e-05, + "loss": 0.3875, + "step": 2137 + }, + { + "epoch": 2.2624338624338622, + "grad_norm": 0.19823862095773512, + "learning_rate": 1.3680909447275578e-05, + "loss": 0.2947, + "step": 2138 + }, + { + "epoch": 2.2634920634920634, + "grad_norm": 0.21825276904693997, + "learning_rate": 1.3661309290474325e-05, + "loss": 0.3833, + "step": 2139 + }, + { + "epoch": 2.2645502645502646, + "grad_norm": 0.21409936187628198, + "learning_rate": 1.364170913367307e-05, + "loss": 0.3485, + "step": 2140 + }, + { + "epoch": 2.265608465608466, + "grad_norm": 0.19696631739592757, + "learning_rate": 1.3622108976871817e-05, + "loss": 0.325, + "step": 2141 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 0.18972546473558874, + "learning_rate": 1.3602508820070561e-05, + "loss": 0.3315, + "step": 2142 + }, + { + "epoch": 2.267724867724868, + "grad_norm": 0.20576822255139696, + "learning_rate": 1.3582908663269305e-05, + "loss": 0.3432, + "step": 2143 + }, + { + "epoch": 2.268783068783069, + "grad_norm": 1.5364918959173623, + "learning_rate": 1.3563308506468053e-05, + "loss": 0.3966, + "step": 2144 + }, + { + "epoch": 2.2698412698412698, + "grad_norm": 0.21478874797886527, + "learning_rate": 1.3543708349666797e-05, + "loss": 0.3491, + "step": 2145 + }, + { + "epoch": 2.270899470899471, + "grad_norm": 0.21276572642119787, + "learning_rate": 1.3524108192865541e-05, + "loss": 0.3825, + "step": 2146 + }, + { + "epoch": 2.271957671957672, + "grad_norm": 0.1955401249488189, + "learning_rate": 1.3504508036064289e-05, + "loss": 0.3066, + "step": 2147 + }, + { + "epoch": 2.273015873015873, + "grad_norm": 0.19895969650589623, + "learning_rate": 1.3484907879263035e-05, + "loss": 0.3479, + "step": 2148 + }, + { + "epoch": 2.274074074074074, + "grad_norm": 0.21571935832308445, + "learning_rate": 1.346530772246178e-05, + "loss": 0.3688, + "step": 2149 + }, + { + "epoch": 2.2751322751322753, + "grad_norm": 0.21735528938360044, + "learning_rate": 1.3445707565660527e-05, + "loss": 0.3332, + "step": 2150 + }, + { + "epoch": 2.276190476190476, + "grad_norm": 0.19334954892944037, + "learning_rate": 1.342610740885927e-05, + "loss": 0.313, + "step": 2151 + }, + { + "epoch": 2.2772486772486773, + "grad_norm": 0.20885026871074683, + "learning_rate": 1.3406507252058018e-05, + "loss": 0.3296, + "step": 2152 + }, + { + "epoch": 2.2783068783068785, + "grad_norm": 0.1976772642773375, + "learning_rate": 1.3386907095256762e-05, + "loss": 0.3354, + "step": 2153 + }, + { + "epoch": 2.2793650793650793, + "grad_norm": 0.21922541090556855, + "learning_rate": 1.336730693845551e-05, + "loss": 0.3702, + "step": 2154 + }, + { + "epoch": 2.2804232804232805, + "grad_norm": 0.22298245670974845, + "learning_rate": 1.3347706781654254e-05, + "loss": 0.3339, + "step": 2155 + }, + { + "epoch": 2.2814814814814817, + "grad_norm": 0.21558822449093906, + "learning_rate": 1.3328106624852998e-05, + "loss": 0.3699, + "step": 2156 + }, + { + "epoch": 2.2825396825396824, + "grad_norm": 0.20618573290980355, + "learning_rate": 1.3308506468051746e-05, + "loss": 0.3411, + "step": 2157 + }, + { + "epoch": 2.2835978835978836, + "grad_norm": 0.22181612653063038, + "learning_rate": 1.328890631125049e-05, + "loss": 0.3479, + "step": 2158 + }, + { + "epoch": 2.284656084656085, + "grad_norm": 0.2033737787576765, + "learning_rate": 1.3269306154449238e-05, + "loss": 0.309, + "step": 2159 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.2304280611065484, + "learning_rate": 1.3249705997647982e-05, + "loss": 0.3606, + "step": 2160 + }, + { + "epoch": 2.286772486772487, + "grad_norm": 0.21105846127089964, + "learning_rate": 1.3230105840846726e-05, + "loss": 0.3343, + "step": 2161 + }, + { + "epoch": 2.287830687830688, + "grad_norm": 0.21136860640342398, + "learning_rate": 1.3210505684045474e-05, + "loss": 0.3407, + "step": 2162 + }, + { + "epoch": 2.2888888888888888, + "grad_norm": 0.21010473995437073, + "learning_rate": 1.3190905527244218e-05, + "loss": 0.3338, + "step": 2163 + }, + { + "epoch": 2.28994708994709, + "grad_norm": 0.21899937987852755, + "learning_rate": 1.3171305370442964e-05, + "loss": 0.3486, + "step": 2164 + }, + { + "epoch": 2.291005291005291, + "grad_norm": 0.1975690727274802, + "learning_rate": 1.315170521364171e-05, + "loss": 0.3115, + "step": 2165 + }, + { + "epoch": 2.292063492063492, + "grad_norm": 0.19198092778024467, + "learning_rate": 1.3132105056840455e-05, + "loss": 0.3275, + "step": 2166 + }, + { + "epoch": 2.293121693121693, + "grad_norm": 0.19304872180177207, + "learning_rate": 1.3112504900039201e-05, + "loss": 0.3058, + "step": 2167 + }, + { + "epoch": 2.2941798941798943, + "grad_norm": 0.22184189861680062, + "learning_rate": 1.3092904743237947e-05, + "loss": 0.3557, + "step": 2168 + }, + { + "epoch": 2.295238095238095, + "grad_norm": 0.2044727357753798, + "learning_rate": 1.3073304586436691e-05, + "loss": 0.3775, + "step": 2169 + }, + { + "epoch": 2.2962962962962963, + "grad_norm": 0.2109940769725802, + "learning_rate": 1.3053704429635439e-05, + "loss": 0.3606, + "step": 2170 + }, + { + "epoch": 2.2973544973544975, + "grad_norm": 0.20603745974637752, + "learning_rate": 1.3034104272834183e-05, + "loss": 0.3568, + "step": 2171 + }, + { + "epoch": 2.2984126984126982, + "grad_norm": 0.20723859027326352, + "learning_rate": 1.301450411603293e-05, + "loss": 0.3598, + "step": 2172 + }, + { + "epoch": 2.2994708994708994, + "grad_norm": 0.22014955361649102, + "learning_rate": 1.2994903959231675e-05, + "loss": 0.3645, + "step": 2173 + }, + { + "epoch": 2.3005291005291006, + "grad_norm": 0.21240912935645942, + "learning_rate": 1.2975303802430419e-05, + "loss": 0.3227, + "step": 2174 + }, + { + "epoch": 2.3015873015873014, + "grad_norm": 0.1984055417276385, + "learning_rate": 1.2955703645629167e-05, + "loss": 0.3403, + "step": 2175 + }, + { + "epoch": 2.3026455026455026, + "grad_norm": 0.20615107635995517, + "learning_rate": 1.293610348882791e-05, + "loss": 0.357, + "step": 2176 + }, + { + "epoch": 2.303703703703704, + "grad_norm": 0.2883475072855802, + "learning_rate": 1.2916503332026658e-05, + "loss": 0.328, + "step": 2177 + }, + { + "epoch": 2.3047619047619046, + "grad_norm": 0.21617403311285718, + "learning_rate": 1.2896903175225403e-05, + "loss": 0.3207, + "step": 2178 + }, + { + "epoch": 2.3058201058201058, + "grad_norm": 0.22261165487391843, + "learning_rate": 1.2877303018424147e-05, + "loss": 0.3905, + "step": 2179 + }, + { + "epoch": 2.306878306878307, + "grad_norm": 0.19746052299888087, + "learning_rate": 1.2857702861622894e-05, + "loss": 0.3344, + "step": 2180 + }, + { + "epoch": 2.3079365079365077, + "grad_norm": 0.20619793904799188, + "learning_rate": 1.2838102704821638e-05, + "loss": 0.3187, + "step": 2181 + }, + { + "epoch": 2.308994708994709, + "grad_norm": 0.21727096923527425, + "learning_rate": 1.2818502548020384e-05, + "loss": 0.393, + "step": 2182 + }, + { + "epoch": 2.31005291005291, + "grad_norm": 0.22496945697900078, + "learning_rate": 1.279890239121913e-05, + "loss": 0.4016, + "step": 2183 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 0.20878049645676627, + "learning_rate": 1.2779302234417876e-05, + "loss": 0.3308, + "step": 2184 + }, + { + "epoch": 2.312169312169312, + "grad_norm": 0.19876898454737266, + "learning_rate": 1.2759702077616622e-05, + "loss": 0.3472, + "step": 2185 + }, + { + "epoch": 2.3132275132275133, + "grad_norm": 0.22470198675510053, + "learning_rate": 1.2740101920815368e-05, + "loss": 0.3723, + "step": 2186 + }, + { + "epoch": 2.314285714285714, + "grad_norm": 0.2149186208399833, + "learning_rate": 1.2720501764014112e-05, + "loss": 0.3609, + "step": 2187 + }, + { + "epoch": 2.3153439153439153, + "grad_norm": 0.21505615763181798, + "learning_rate": 1.270090160721286e-05, + "loss": 0.3775, + "step": 2188 + }, + { + "epoch": 2.3164021164021165, + "grad_norm": 0.21734973914888298, + "learning_rate": 1.2681301450411604e-05, + "loss": 0.3578, + "step": 2189 + }, + { + "epoch": 2.317460317460317, + "grad_norm": 0.19738584826580072, + "learning_rate": 1.2661701293610351e-05, + "loss": 0.3114, + "step": 2190 + }, + { + "epoch": 2.3185185185185184, + "grad_norm": 0.2059651241318454, + "learning_rate": 1.2642101136809096e-05, + "loss": 0.3503, + "step": 2191 + }, + { + "epoch": 2.3195767195767196, + "grad_norm": 0.21191185476329366, + "learning_rate": 1.262250098000784e-05, + "loss": 0.3277, + "step": 2192 + }, + { + "epoch": 2.320634920634921, + "grad_norm": 0.21822223673790192, + "learning_rate": 1.2602900823206587e-05, + "loss": 0.3402, + "step": 2193 + }, + { + "epoch": 2.3216931216931216, + "grad_norm": 0.22119523492818854, + "learning_rate": 1.2583300666405331e-05, + "loss": 0.3931, + "step": 2194 + }, + { + "epoch": 2.322751322751323, + "grad_norm": 0.18136041095386, + "learning_rate": 1.2563700509604076e-05, + "loss": 0.3065, + "step": 2195 + }, + { + "epoch": 2.323809523809524, + "grad_norm": 0.28983751655981344, + "learning_rate": 1.2544100352802823e-05, + "loss": 0.366, + "step": 2196 + }, + { + "epoch": 2.3248677248677247, + "grad_norm": 0.2191979489235065, + "learning_rate": 1.2524500196001567e-05, + "loss": 0.3444, + "step": 2197 + }, + { + "epoch": 2.325925925925926, + "grad_norm": 0.200384296867351, + "learning_rate": 1.2504900039200315e-05, + "loss": 0.3277, + "step": 2198 + }, + { + "epoch": 2.326984126984127, + "grad_norm": 0.20207450606748373, + "learning_rate": 1.248529988239906e-05, + "loss": 0.3619, + "step": 2199 + }, + { + "epoch": 2.328042328042328, + "grad_norm": 0.20817037763377247, + "learning_rate": 1.2465699725597805e-05, + "loss": 0.349, + "step": 2200 + }, + { + "epoch": 2.329100529100529, + "grad_norm": 0.20772850458509592, + "learning_rate": 1.2446099568796551e-05, + "loss": 0.3289, + "step": 2201 + }, + { + "epoch": 2.3301587301587303, + "grad_norm": 1.2036917950362793, + "learning_rate": 1.2426499411995297e-05, + "loss": 0.4814, + "step": 2202 + }, + { + "epoch": 2.331216931216931, + "grad_norm": 0.21881788351835138, + "learning_rate": 1.2406899255194043e-05, + "loss": 0.3625, + "step": 2203 + }, + { + "epoch": 2.3322751322751323, + "grad_norm": 0.20154036096455355, + "learning_rate": 1.2387299098392789e-05, + "loss": 0.3686, + "step": 2204 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.20121002730380697, + "learning_rate": 1.2367698941591534e-05, + "loss": 0.3603, + "step": 2205 + }, + { + "epoch": 2.3343915343915342, + "grad_norm": 0.2197435442897782, + "learning_rate": 1.234809878479028e-05, + "loss": 0.349, + "step": 2206 + }, + { + "epoch": 2.3354497354497354, + "grad_norm": 0.21748291890475052, + "learning_rate": 1.2328498627989024e-05, + "loss": 0.4108, + "step": 2207 + }, + { + "epoch": 2.3365079365079366, + "grad_norm": 0.18641463759702262, + "learning_rate": 1.230889847118777e-05, + "loss": 0.31, + "step": 2208 + }, + { + "epoch": 2.3375661375661374, + "grad_norm": 0.19503644892500818, + "learning_rate": 1.2289298314386516e-05, + "loss": 0.3595, + "step": 2209 + }, + { + "epoch": 2.3386243386243386, + "grad_norm": 0.20574304094175103, + "learning_rate": 1.2269698157585262e-05, + "loss": 0.3666, + "step": 2210 + }, + { + "epoch": 2.33968253968254, + "grad_norm": 0.20069061348310832, + "learning_rate": 1.2250098000784006e-05, + "loss": 0.3123, + "step": 2211 + }, + { + "epoch": 2.3407407407407406, + "grad_norm": 0.20015298378799415, + "learning_rate": 1.2230497843982752e-05, + "loss": 0.3545, + "step": 2212 + }, + { + "epoch": 2.3417989417989418, + "grad_norm": 0.19565316573476668, + "learning_rate": 1.2210897687181498e-05, + "loss": 0.3481, + "step": 2213 + }, + { + "epoch": 2.342857142857143, + "grad_norm": 0.213652830993624, + "learning_rate": 1.2191297530380244e-05, + "loss": 0.3492, + "step": 2214 + }, + { + "epoch": 2.3439153439153437, + "grad_norm": 0.211527905920233, + "learning_rate": 1.2171697373578988e-05, + "loss": 0.372, + "step": 2215 + }, + { + "epoch": 2.344973544973545, + "grad_norm": 0.20693192379565375, + "learning_rate": 1.2152097216777734e-05, + "loss": 0.3772, + "step": 2216 + }, + { + "epoch": 2.346031746031746, + "grad_norm": 0.20506549919992692, + "learning_rate": 1.213249705997648e-05, + "loss": 0.3506, + "step": 2217 + }, + { + "epoch": 2.3470899470899473, + "grad_norm": 0.29640440277261043, + "learning_rate": 1.2112896903175226e-05, + "loss": 0.4008, + "step": 2218 + }, + { + "epoch": 2.348148148148148, + "grad_norm": 0.21563211935876653, + "learning_rate": 1.2093296746373972e-05, + "loss": 0.366, + "step": 2219 + }, + { + "epoch": 2.3492063492063493, + "grad_norm": 0.20236613701883793, + "learning_rate": 1.2073696589572717e-05, + "loss": 0.3492, + "step": 2220 + }, + { + "epoch": 2.3502645502645505, + "grad_norm": 0.1966092544544384, + "learning_rate": 1.2054096432771463e-05, + "loss": 0.3317, + "step": 2221 + }, + { + "epoch": 2.3513227513227513, + "grad_norm": 0.21784607574068482, + "learning_rate": 1.203449627597021e-05, + "loss": 0.3847, + "step": 2222 + }, + { + "epoch": 2.3523809523809525, + "grad_norm": 0.21084416216589955, + "learning_rate": 1.2014896119168955e-05, + "loss": 0.3637, + "step": 2223 + }, + { + "epoch": 2.3534391534391537, + "grad_norm": 0.1991773340459715, + "learning_rate": 1.19952959623677e-05, + "loss": 0.3303, + "step": 2224 + }, + { + "epoch": 2.3544973544973544, + "grad_norm": 0.1918709626897971, + "learning_rate": 1.1975695805566445e-05, + "loss": 0.3025, + "step": 2225 + }, + { + "epoch": 2.3555555555555556, + "grad_norm": 0.19838360255526297, + "learning_rate": 1.1956095648765191e-05, + "loss": 0.3105, + "step": 2226 + }, + { + "epoch": 2.356613756613757, + "grad_norm": 0.20910709785260412, + "learning_rate": 1.1936495491963937e-05, + "loss": 0.3434, + "step": 2227 + }, + { + "epoch": 2.3576719576719576, + "grad_norm": 0.20813311118331637, + "learning_rate": 1.1916895335162683e-05, + "loss": 0.3388, + "step": 2228 + }, + { + "epoch": 2.358730158730159, + "grad_norm": 0.21726113463115548, + "learning_rate": 1.1897295178361427e-05, + "loss": 0.3551, + "step": 2229 + }, + { + "epoch": 2.35978835978836, + "grad_norm": 0.21011962247860588, + "learning_rate": 1.1877695021560173e-05, + "loss": 0.3509, + "step": 2230 + }, + { + "epoch": 2.3608465608465607, + "grad_norm": 0.1953218231695064, + "learning_rate": 1.1858094864758919e-05, + "loss": 0.319, + "step": 2231 + }, + { + "epoch": 2.361904761904762, + "grad_norm": 0.20596213011515244, + "learning_rate": 1.1838494707957665e-05, + "loss": 0.3517, + "step": 2232 + }, + { + "epoch": 2.362962962962963, + "grad_norm": 0.2126925193532187, + "learning_rate": 1.1818894551156409e-05, + "loss": 0.3767, + "step": 2233 + }, + { + "epoch": 2.364021164021164, + "grad_norm": 0.19743855767120733, + "learning_rate": 1.1799294394355155e-05, + "loss": 0.3302, + "step": 2234 + }, + { + "epoch": 2.365079365079365, + "grad_norm": 0.1867269633062068, + "learning_rate": 1.17796942375539e-05, + "loss": 0.3275, + "step": 2235 + }, + { + "epoch": 2.3661375661375663, + "grad_norm": 0.19798324638566683, + "learning_rate": 1.1760094080752646e-05, + "loss": 0.3358, + "step": 2236 + }, + { + "epoch": 2.367195767195767, + "grad_norm": 0.21204816572937232, + "learning_rate": 1.1740493923951392e-05, + "loss": 0.3557, + "step": 2237 + }, + { + "epoch": 2.3682539682539683, + "grad_norm": 0.20561285406488616, + "learning_rate": 1.1720893767150138e-05, + "loss": 0.3859, + "step": 2238 + }, + { + "epoch": 2.3693121693121695, + "grad_norm": 0.1844307143572627, + "learning_rate": 1.1701293610348884e-05, + "loss": 0.3093, + "step": 2239 + }, + { + "epoch": 2.3703703703703702, + "grad_norm": 0.18285268707845753, + "learning_rate": 1.168169345354763e-05, + "loss": 0.2879, + "step": 2240 + }, + { + "epoch": 2.3714285714285714, + "grad_norm": 0.18891596600216767, + "learning_rate": 1.1662093296746376e-05, + "loss": 0.3164, + "step": 2241 + }, + { + "epoch": 2.3724867724867726, + "grad_norm": 0.21443463118106196, + "learning_rate": 1.164249313994512e-05, + "loss": 0.3724, + "step": 2242 + }, + { + "epoch": 2.3735449735449734, + "grad_norm": 0.20092191567682116, + "learning_rate": 1.1622892983143866e-05, + "loss": 0.336, + "step": 2243 + }, + { + "epoch": 2.3746031746031746, + "grad_norm": 0.21661040764716843, + "learning_rate": 1.1603292826342612e-05, + "loss": 0.3457, + "step": 2244 + }, + { + "epoch": 2.375661375661376, + "grad_norm": 0.190041744765612, + "learning_rate": 1.1583692669541358e-05, + "loss": 0.3251, + "step": 2245 + }, + { + "epoch": 2.3767195767195766, + "grad_norm": 0.1886509935215561, + "learning_rate": 1.1564092512740103e-05, + "loss": 0.3268, + "step": 2246 + }, + { + "epoch": 2.3777777777777778, + "grad_norm": 0.19137796325706583, + "learning_rate": 1.1544492355938848e-05, + "loss": 0.3137, + "step": 2247 + }, + { + "epoch": 2.378835978835979, + "grad_norm": 0.19633048102603665, + "learning_rate": 1.1524892199137593e-05, + "loss": 0.3428, + "step": 2248 + }, + { + "epoch": 2.3798941798941797, + "grad_norm": 0.20298501174663242, + "learning_rate": 1.150529204233634e-05, + "loss": 0.3336, + "step": 2249 + }, + { + "epoch": 2.380952380952381, + "grad_norm": 0.20915086314748638, + "learning_rate": 1.1485691885535085e-05, + "loss": 0.3538, + "step": 2250 + }, + { + "epoch": 2.382010582010582, + "grad_norm": 0.1959406139546007, + "learning_rate": 1.146609172873383e-05, + "loss": 0.3204, + "step": 2251 + }, + { + "epoch": 2.383068783068783, + "grad_norm": 0.1980356952193389, + "learning_rate": 1.1446491571932575e-05, + "loss": 0.3369, + "step": 2252 + }, + { + "epoch": 2.384126984126984, + "grad_norm": 0.21603527021327792, + "learning_rate": 1.1426891415131321e-05, + "loss": 0.4106, + "step": 2253 + }, + { + "epoch": 2.3851851851851853, + "grad_norm": 0.19272692236278238, + "learning_rate": 1.1407291258330067e-05, + "loss": 0.3393, + "step": 2254 + }, + { + "epoch": 2.386243386243386, + "grad_norm": 0.1956248707684031, + "learning_rate": 1.1387691101528813e-05, + "loss": 0.3411, + "step": 2255 + }, + { + "epoch": 2.3873015873015873, + "grad_norm": 0.19014685494467712, + "learning_rate": 1.1368090944727559e-05, + "loss": 0.3185, + "step": 2256 + }, + { + "epoch": 2.3883597883597885, + "grad_norm": 0.20447615014158566, + "learning_rate": 1.1348490787926305e-05, + "loss": 0.3513, + "step": 2257 + }, + { + "epoch": 2.389417989417989, + "grad_norm": 0.21079536892016623, + "learning_rate": 1.132889063112505e-05, + "loss": 0.3377, + "step": 2258 + }, + { + "epoch": 2.3904761904761904, + "grad_norm": 0.20403855917854744, + "learning_rate": 1.1309290474323796e-05, + "loss": 0.3728, + "step": 2259 + }, + { + "epoch": 2.3915343915343916, + "grad_norm": 0.19160044266304876, + "learning_rate": 1.128969031752254e-05, + "loss": 0.3113, + "step": 2260 + }, + { + "epoch": 2.3925925925925924, + "grad_norm": 0.2081785958094779, + "learning_rate": 1.1270090160721286e-05, + "loss": 0.3618, + "step": 2261 + }, + { + "epoch": 2.3936507936507936, + "grad_norm": 0.2080027481507071, + "learning_rate": 1.1250490003920032e-05, + "loss": 0.3484, + "step": 2262 + }, + { + "epoch": 2.394708994708995, + "grad_norm": 0.2048194108111222, + "learning_rate": 1.1230889847118778e-05, + "loss": 0.3429, + "step": 2263 + }, + { + "epoch": 2.3957671957671955, + "grad_norm": 0.2103500941268802, + "learning_rate": 1.1211289690317522e-05, + "loss": 0.3665, + "step": 2264 + }, + { + "epoch": 2.3968253968253967, + "grad_norm": 0.21252569969121238, + "learning_rate": 1.1191689533516268e-05, + "loss": 0.3346, + "step": 2265 + }, + { + "epoch": 2.397883597883598, + "grad_norm": 0.19587149638160067, + "learning_rate": 1.1172089376715014e-05, + "loss": 0.3352, + "step": 2266 + }, + { + "epoch": 2.3989417989417987, + "grad_norm": 0.21361691084135886, + "learning_rate": 1.115248921991376e-05, + "loss": 0.3695, + "step": 2267 + }, + { + "epoch": 2.4, + "grad_norm": 0.1860233174405519, + "learning_rate": 1.1132889063112506e-05, + "loss": 0.3049, + "step": 2268 + }, + { + "epoch": 2.401058201058201, + "grad_norm": 0.21531066517258854, + "learning_rate": 1.111328890631125e-05, + "loss": 0.3919, + "step": 2269 + }, + { + "epoch": 2.402116402116402, + "grad_norm": 0.20973338346864226, + "learning_rate": 1.1093688749509996e-05, + "loss": 0.309, + "step": 2270 + }, + { + "epoch": 2.403174603174603, + "grad_norm": 0.19942109842684344, + "learning_rate": 1.1074088592708742e-05, + "loss": 0.3438, + "step": 2271 + }, + { + "epoch": 2.4042328042328043, + "grad_norm": 0.20332330726245493, + "learning_rate": 1.1054488435907488e-05, + "loss": 0.3716, + "step": 2272 + }, + { + "epoch": 2.4052910052910055, + "grad_norm": 0.2109034509123887, + "learning_rate": 1.1034888279106234e-05, + "loss": 0.3815, + "step": 2273 + }, + { + "epoch": 2.4063492063492062, + "grad_norm": 0.20967094697117655, + "learning_rate": 1.101528812230498e-05, + "loss": 0.3791, + "step": 2274 + }, + { + "epoch": 2.4074074074074074, + "grad_norm": 0.20137962520759609, + "learning_rate": 1.0995687965503725e-05, + "loss": 0.352, + "step": 2275 + }, + { + "epoch": 2.4084656084656086, + "grad_norm": 0.1959407595415704, + "learning_rate": 1.0976087808702471e-05, + "loss": 0.3128, + "step": 2276 + }, + { + "epoch": 2.4095238095238094, + "grad_norm": 0.2025181599605024, + "learning_rate": 1.0956487651901217e-05, + "loss": 0.3643, + "step": 2277 + }, + { + "epoch": 2.4105820105820106, + "grad_norm": 0.230544012868408, + "learning_rate": 1.0936887495099961e-05, + "loss": 0.3726, + "step": 2278 + }, + { + "epoch": 2.411640211640212, + "grad_norm": 0.1990066422140862, + "learning_rate": 1.0917287338298707e-05, + "loss": 0.3638, + "step": 2279 + }, + { + "epoch": 2.4126984126984126, + "grad_norm": 0.18998917405534244, + "learning_rate": 1.0897687181497453e-05, + "loss": 0.3274, + "step": 2280 + }, + { + "epoch": 2.4137566137566138, + "grad_norm": 0.20400621001125108, + "learning_rate": 1.0878087024696199e-05, + "loss": 0.3977, + "step": 2281 + }, + { + "epoch": 2.414814814814815, + "grad_norm": 0.19362884565369348, + "learning_rate": 1.0858486867894943e-05, + "loss": 0.2982, + "step": 2282 + }, + { + "epoch": 2.4158730158730157, + "grad_norm": 0.19304320410771325, + "learning_rate": 1.0838886711093689e-05, + "loss": 0.3363, + "step": 2283 + }, + { + "epoch": 2.416931216931217, + "grad_norm": 0.18681427914983176, + "learning_rate": 1.0819286554292435e-05, + "loss": 0.2841, + "step": 2284 + }, + { + "epoch": 2.417989417989418, + "grad_norm": 0.20184125426045912, + "learning_rate": 1.079968639749118e-05, + "loss": 0.3428, + "step": 2285 + }, + { + "epoch": 2.419047619047619, + "grad_norm": 0.2037706142983145, + "learning_rate": 1.0780086240689927e-05, + "loss": 0.3649, + "step": 2286 + }, + { + "epoch": 2.42010582010582, + "grad_norm": 0.19817399493675728, + "learning_rate": 1.076048608388867e-05, + "loss": 0.3506, + "step": 2287 + }, + { + "epoch": 2.4211640211640213, + "grad_norm": 0.20169589584258404, + "learning_rate": 1.0740885927087417e-05, + "loss": 0.357, + "step": 2288 + }, + { + "epoch": 2.422222222222222, + "grad_norm": 0.20942728369583866, + "learning_rate": 1.0721285770286162e-05, + "loss": 0.3732, + "step": 2289 + }, + { + "epoch": 2.4232804232804233, + "grad_norm": 0.19265517759072814, + "learning_rate": 1.0701685613484908e-05, + "loss": 0.3176, + "step": 2290 + }, + { + "epoch": 2.4243386243386245, + "grad_norm": 0.1982086084106544, + "learning_rate": 1.0682085456683654e-05, + "loss": 0.3467, + "step": 2291 + }, + { + "epoch": 2.425396825396825, + "grad_norm": 0.19447098693465864, + "learning_rate": 1.06624852998824e-05, + "loss": 0.3299, + "step": 2292 + }, + { + "epoch": 2.4264550264550264, + "grad_norm": 0.20888212480507384, + "learning_rate": 1.0642885143081146e-05, + "loss": 0.3459, + "step": 2293 + }, + { + "epoch": 2.4275132275132276, + "grad_norm": 0.208144303971245, + "learning_rate": 1.0623284986279892e-05, + "loss": 0.367, + "step": 2294 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.20876100051287758, + "learning_rate": 1.0603684829478638e-05, + "loss": 0.3745, + "step": 2295 + }, + { + "epoch": 2.4296296296296296, + "grad_norm": 0.20791252154146914, + "learning_rate": 1.0584084672677382e-05, + "loss": 0.3491, + "step": 2296 + }, + { + "epoch": 2.430687830687831, + "grad_norm": 0.19777559356235025, + "learning_rate": 1.0564484515876128e-05, + "loss": 0.3374, + "step": 2297 + }, + { + "epoch": 2.431746031746032, + "grad_norm": 0.19110723755738349, + "learning_rate": 1.0544884359074874e-05, + "loss": 0.3271, + "step": 2298 + }, + { + "epoch": 2.4328042328042327, + "grad_norm": 0.2008378278505105, + "learning_rate": 1.052528420227362e-05, + "loss": 0.3387, + "step": 2299 + }, + { + "epoch": 2.433862433862434, + "grad_norm": 0.21027780382458788, + "learning_rate": 1.0505684045472364e-05, + "loss": 0.3179, + "step": 2300 + }, + { + "epoch": 2.434920634920635, + "grad_norm": 0.2026493429086862, + "learning_rate": 1.048608388867111e-05, + "loss": 0.3345, + "step": 2301 + }, + { + "epoch": 2.435978835978836, + "grad_norm": 0.20274404356172807, + "learning_rate": 1.0466483731869855e-05, + "loss": 0.3801, + "step": 2302 + }, + { + "epoch": 2.437037037037037, + "grad_norm": 0.19273549407390878, + "learning_rate": 1.0446883575068601e-05, + "loss": 0.3467, + "step": 2303 + }, + { + "epoch": 2.4380952380952383, + "grad_norm": 0.1818577084303832, + "learning_rate": 1.0427283418267345e-05, + "loss": 0.3202, + "step": 2304 + }, + { + "epoch": 2.439153439153439, + "grad_norm": 0.19560068167078504, + "learning_rate": 1.0407683261466091e-05, + "loss": 0.3273, + "step": 2305 + }, + { + "epoch": 2.4402116402116403, + "grad_norm": 0.20370517607190086, + "learning_rate": 1.0388083104664837e-05, + "loss": 0.3403, + "step": 2306 + }, + { + "epoch": 2.4412698412698415, + "grad_norm": 0.20114205886933845, + "learning_rate": 1.0368482947863583e-05, + "loss": 0.3522, + "step": 2307 + }, + { + "epoch": 2.4423280423280422, + "grad_norm": 0.20882464854711777, + "learning_rate": 1.0348882791062329e-05, + "loss": 0.3666, + "step": 2308 + }, + { + "epoch": 2.4433862433862434, + "grad_norm": 0.20438756339590045, + "learning_rate": 1.0329282634261075e-05, + "loss": 0.3435, + "step": 2309 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.22064605217007735, + "learning_rate": 1.030968247745982e-05, + "loss": 0.4142, + "step": 2310 + }, + { + "epoch": 2.4455026455026454, + "grad_norm": 0.1817014882330108, + "learning_rate": 1.0290082320658567e-05, + "loss": 0.2751, + "step": 2311 + }, + { + "epoch": 2.4465608465608466, + "grad_norm": 0.19417363691429626, + "learning_rate": 1.0270482163857312e-05, + "loss": 0.347, + "step": 2312 + }, + { + "epoch": 2.447619047619048, + "grad_norm": 0.19984236991627735, + "learning_rate": 1.0250882007056057e-05, + "loss": 0.358, + "step": 2313 + }, + { + "epoch": 2.4486772486772486, + "grad_norm": 0.20561224905848144, + "learning_rate": 1.0231281850254803e-05, + "loss": 0.3748, + "step": 2314 + }, + { + "epoch": 2.4497354497354498, + "grad_norm": 0.2030555882275226, + "learning_rate": 1.0211681693453548e-05, + "loss": 0.3333, + "step": 2315 + }, + { + "epoch": 2.450793650793651, + "grad_norm": 0.18955391416822132, + "learning_rate": 1.0192081536652294e-05, + "loss": 0.3067, + "step": 2316 + }, + { + "epoch": 2.4518518518518517, + "grad_norm": 0.214593330571037, + "learning_rate": 1.017248137985104e-05, + "loss": 0.4056, + "step": 2317 + }, + { + "epoch": 2.452910052910053, + "grad_norm": 0.20084964435456928, + "learning_rate": 1.0152881223049784e-05, + "loss": 0.3302, + "step": 2318 + }, + { + "epoch": 2.453968253968254, + "grad_norm": 0.19398682980431026, + "learning_rate": 1.013328106624853e-05, + "loss": 0.3213, + "step": 2319 + }, + { + "epoch": 2.455026455026455, + "grad_norm": 0.22095190181637556, + "learning_rate": 1.0113680909447276e-05, + "loss": 0.4104, + "step": 2320 + }, + { + "epoch": 2.456084656084656, + "grad_norm": 0.21142265935421076, + "learning_rate": 1.0094080752646022e-05, + "loss": 0.3691, + "step": 2321 + }, + { + "epoch": 2.4571428571428573, + "grad_norm": 0.20774600162711074, + "learning_rate": 1.0074480595844766e-05, + "loss": 0.3576, + "step": 2322 + }, + { + "epoch": 2.458201058201058, + "grad_norm": 0.19470750734709114, + "learning_rate": 1.0054880439043512e-05, + "loss": 0.3391, + "step": 2323 + }, + { + "epoch": 2.4592592592592593, + "grad_norm": 0.20179308051014488, + "learning_rate": 1.0035280282242258e-05, + "loss": 0.3753, + "step": 2324 + }, + { + "epoch": 2.4603174603174605, + "grad_norm": 0.1964229149495673, + "learning_rate": 1.0015680125441004e-05, + "loss": 0.323, + "step": 2325 + }, + { + "epoch": 2.461375661375661, + "grad_norm": 0.20063757362174908, + "learning_rate": 9.99607996863975e-06, + "loss": 0.3787, + "step": 2326 + }, + { + "epoch": 2.4624338624338624, + "grad_norm": 0.1930747913002795, + "learning_rate": 9.976479811838496e-06, + "loss": 0.319, + "step": 2327 + }, + { + "epoch": 2.4634920634920636, + "grad_norm": 0.21711063943337874, + "learning_rate": 9.956879655037241e-06, + "loss": 0.3542, + "step": 2328 + }, + { + "epoch": 2.4645502645502644, + "grad_norm": 0.1903790970553673, + "learning_rate": 9.937279498235987e-06, + "loss": 0.3267, + "step": 2329 + }, + { + "epoch": 2.4656084656084656, + "grad_norm": 0.3974441833827003, + "learning_rate": 9.917679341434733e-06, + "loss": 0.3863, + "step": 2330 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 0.21646430718812706, + "learning_rate": 9.898079184633477e-06, + "loss": 0.3876, + "step": 2331 + }, + { + "epoch": 2.4677248677248675, + "grad_norm": 0.22008541249629765, + "learning_rate": 9.878479027832223e-06, + "loss": 0.328, + "step": 2332 + }, + { + "epoch": 2.4687830687830687, + "grad_norm": 0.19270815027961807, + "learning_rate": 9.858878871030969e-06, + "loss": 0.3422, + "step": 2333 + }, + { + "epoch": 2.46984126984127, + "grad_norm": 0.2002872486600827, + "learning_rate": 9.839278714229715e-06, + "loss": 0.3509, + "step": 2334 + }, + { + "epoch": 2.4708994708994707, + "grad_norm": 0.216151009251505, + "learning_rate": 9.81967855742846e-06, + "loss": 0.3872, + "step": 2335 + }, + { + "epoch": 2.471957671957672, + "grad_norm": 0.21555323390899783, + "learning_rate": 9.800078400627205e-06, + "loss": 0.3542, + "step": 2336 + }, + { + "epoch": 2.473015873015873, + "grad_norm": 0.18992518981588918, + "learning_rate": 9.780478243825951e-06, + "loss": 0.3061, + "step": 2337 + }, + { + "epoch": 2.474074074074074, + "grad_norm": 0.20207716846236165, + "learning_rate": 9.760878087024697e-06, + "loss": 0.3492, + "step": 2338 + }, + { + "epoch": 2.475132275132275, + "grad_norm": 0.3238919978391247, + "learning_rate": 9.741277930223443e-06, + "loss": 0.3811, + "step": 2339 + }, + { + "epoch": 2.4761904761904763, + "grad_norm": 0.20016631554309963, + "learning_rate": 9.721677773422187e-06, + "loss": 0.3375, + "step": 2340 + }, + { + "epoch": 2.477248677248677, + "grad_norm": 0.2099603304873576, + "learning_rate": 9.702077616620933e-06, + "loss": 0.3601, + "step": 2341 + }, + { + "epoch": 2.4783068783068782, + "grad_norm": 0.2163092177109193, + "learning_rate": 9.682477459819679e-06, + "loss": 0.3929, + "step": 2342 + }, + { + "epoch": 2.4793650793650794, + "grad_norm": 0.195942799842103, + "learning_rate": 9.662877303018424e-06, + "loss": 0.3259, + "step": 2343 + }, + { + "epoch": 2.48042328042328, + "grad_norm": 0.20028542223736842, + "learning_rate": 9.64327714621717e-06, + "loss": 0.3195, + "step": 2344 + }, + { + "epoch": 2.4814814814814814, + "grad_norm": 0.2034772057737831, + "learning_rate": 9.623676989415916e-06, + "loss": 0.3481, + "step": 2345 + }, + { + "epoch": 2.4825396825396826, + "grad_norm": 0.19210503275161703, + "learning_rate": 9.604076832614662e-06, + "loss": 0.3252, + "step": 2346 + }, + { + "epoch": 2.4835978835978834, + "grad_norm": 0.2096974621839747, + "learning_rate": 9.584476675813408e-06, + "loss": 0.3513, + "step": 2347 + }, + { + "epoch": 2.4846560846560846, + "grad_norm": 0.20429072459845538, + "learning_rate": 9.564876519012154e-06, + "loss": 0.3549, + "step": 2348 + }, + { + "epoch": 2.4857142857142858, + "grad_norm": 0.18861327099569755, + "learning_rate": 9.545276362210898e-06, + "loss": 0.3381, + "step": 2349 + }, + { + "epoch": 2.4867724867724865, + "grad_norm": 0.19260260318882602, + "learning_rate": 9.525676205409644e-06, + "loss": 0.3248, + "step": 2350 + }, + { + "epoch": 2.4878306878306877, + "grad_norm": 0.2121251443373568, + "learning_rate": 9.50607604860839e-06, + "loss": 0.3893, + "step": 2351 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 0.2153555638932981, + "learning_rate": 9.486475891807136e-06, + "loss": 0.4048, + "step": 2352 + }, + { + "epoch": 2.48994708994709, + "grad_norm": 0.21789073463498923, + "learning_rate": 9.46687573500588e-06, + "loss": 0.3747, + "step": 2353 + }, + { + "epoch": 2.491005291005291, + "grad_norm": 0.20505225693714865, + "learning_rate": 9.447275578204626e-06, + "loss": 0.3438, + "step": 2354 + }, + { + "epoch": 2.492063492063492, + "grad_norm": 0.19519515384302688, + "learning_rate": 9.427675421403372e-06, + "loss": 0.324, + "step": 2355 + }, + { + "epoch": 2.4931216931216933, + "grad_norm": 0.19903623860168693, + "learning_rate": 9.408075264602117e-06, + "loss": 0.3503, + "step": 2356 + }, + { + "epoch": 2.494179894179894, + "grad_norm": 0.2075371111288816, + "learning_rate": 9.388475107800863e-06, + "loss": 0.3645, + "step": 2357 + }, + { + "epoch": 2.4952380952380953, + "grad_norm": 0.20375379109475755, + "learning_rate": 9.368874950999607e-06, + "loss": 0.322, + "step": 2358 + }, + { + "epoch": 2.4962962962962965, + "grad_norm": 0.22033682656514744, + "learning_rate": 9.349274794198353e-06, + "loss": 0.3896, + "step": 2359 + }, + { + "epoch": 2.497354497354497, + "grad_norm": 0.19640384262879929, + "learning_rate": 9.3296746373971e-06, + "loss": 0.312, + "step": 2360 + }, + { + "epoch": 2.4984126984126984, + "grad_norm": 0.2081466305517114, + "learning_rate": 9.310074480595845e-06, + "loss": 0.3352, + "step": 2361 + }, + { + "epoch": 2.4994708994708996, + "grad_norm": 0.1999811658493017, + "learning_rate": 9.290474323794591e-06, + "loss": 0.34, + "step": 2362 + }, + { + "epoch": 2.5005291005291004, + "grad_norm": 0.20242709998710268, + "learning_rate": 9.270874166993337e-06, + "loss": 0.324, + "step": 2363 + }, + { + "epoch": 2.5015873015873016, + "grad_norm": 0.1890758571530466, + "learning_rate": 9.251274010192083e-06, + "loss": 0.3205, + "step": 2364 + }, + { + "epoch": 2.502645502645503, + "grad_norm": 0.2018935760610972, + "learning_rate": 9.231673853390829e-06, + "loss": 0.3512, + "step": 2365 + }, + { + "epoch": 2.5037037037037035, + "grad_norm": 0.1905973585991993, + "learning_rate": 9.212073696589574e-06, + "loss": 0.3184, + "step": 2366 + }, + { + "epoch": 2.5047619047619047, + "grad_norm": 0.20119197145153586, + "learning_rate": 9.192473539788319e-06, + "loss": 0.3715, + "step": 2367 + }, + { + "epoch": 2.505820105820106, + "grad_norm": 0.22913278918931926, + "learning_rate": 9.172873382987065e-06, + "loss": 0.3569, + "step": 2368 + }, + { + "epoch": 2.506878306878307, + "grad_norm": 0.21451859754659375, + "learning_rate": 9.15327322618581e-06, + "loss": 0.3475, + "step": 2369 + }, + { + "epoch": 2.507936507936508, + "grad_norm": 0.18032408092733698, + "learning_rate": 9.133673069384556e-06, + "loss": 0.2882, + "step": 2370 + }, + { + "epoch": 2.508994708994709, + "grad_norm": 0.18785546074799372, + "learning_rate": 9.1140729125833e-06, + "loss": 0.321, + "step": 2371 + }, + { + "epoch": 2.5100529100529103, + "grad_norm": 0.2081663751474157, + "learning_rate": 9.094472755782046e-06, + "loss": 0.37, + "step": 2372 + }, + { + "epoch": 2.511111111111111, + "grad_norm": 0.19467624754830543, + "learning_rate": 9.074872598980792e-06, + "loss": 0.3151, + "step": 2373 + }, + { + "epoch": 2.5121693121693123, + "grad_norm": 0.21111318671913534, + "learning_rate": 9.055272442179538e-06, + "loss": 0.3559, + "step": 2374 + }, + { + "epoch": 2.5132275132275135, + "grad_norm": 0.20158857860626356, + "learning_rate": 9.035672285378284e-06, + "loss": 0.3413, + "step": 2375 + }, + { + "epoch": 2.5142857142857142, + "grad_norm": 0.19967004938402297, + "learning_rate": 9.016072128577028e-06, + "loss": 0.324, + "step": 2376 + }, + { + "epoch": 2.5153439153439154, + "grad_norm": 0.20043339966519105, + "learning_rate": 8.996471971775774e-06, + "loss": 0.3575, + "step": 2377 + }, + { + "epoch": 2.5164021164021166, + "grad_norm": 0.2090345217187549, + "learning_rate": 8.97687181497452e-06, + "loss": 0.3647, + "step": 2378 + }, + { + "epoch": 2.5174603174603174, + "grad_norm": 0.19573505746191802, + "learning_rate": 8.957271658173266e-06, + "loss": 0.3559, + "step": 2379 + }, + { + "epoch": 2.5185185185185186, + "grad_norm": 0.2108974750518579, + "learning_rate": 8.937671501372012e-06, + "loss": 0.3635, + "step": 2380 + }, + { + "epoch": 2.51957671957672, + "grad_norm": 0.200163482912999, + "learning_rate": 8.918071344570758e-06, + "loss": 0.3469, + "step": 2381 + }, + { + "epoch": 2.5206349206349206, + "grad_norm": 0.20864956888060593, + "learning_rate": 8.898471187769503e-06, + "loss": 0.3657, + "step": 2382 + }, + { + "epoch": 2.5216931216931218, + "grad_norm": 0.19842095672056884, + "learning_rate": 8.87887103096825e-06, + "loss": 0.3738, + "step": 2383 + }, + { + "epoch": 2.522751322751323, + "grad_norm": 0.2135741099116242, + "learning_rate": 8.859270874166995e-06, + "loss": 0.4217, + "step": 2384 + }, + { + "epoch": 2.5238095238095237, + "grad_norm": 0.1869862686126683, + "learning_rate": 8.83967071736574e-06, + "loss": 0.3287, + "step": 2385 + }, + { + "epoch": 2.524867724867725, + "grad_norm": 0.2079464545823666, + "learning_rate": 8.820070560564485e-06, + "loss": 0.3214, + "step": 2386 + }, + { + "epoch": 2.525925925925926, + "grad_norm": 0.2196057878276039, + "learning_rate": 8.800470403763231e-06, + "loss": 0.3939, + "step": 2387 + }, + { + "epoch": 2.526984126984127, + "grad_norm": 0.20564907375985877, + "learning_rate": 8.780870246961977e-06, + "loss": 0.3698, + "step": 2388 + }, + { + "epoch": 2.528042328042328, + "grad_norm": 0.18394751990532987, + "learning_rate": 8.761270090160721e-06, + "loss": 0.293, + "step": 2389 + }, + { + "epoch": 2.5291005291005293, + "grad_norm": 0.21363483672053832, + "learning_rate": 8.741669933359467e-06, + "loss": 0.303, + "step": 2390 + }, + { + "epoch": 2.53015873015873, + "grad_norm": 0.20780527897074721, + "learning_rate": 8.722069776558213e-06, + "loss": 0.3877, + "step": 2391 + }, + { + "epoch": 2.5312169312169313, + "grad_norm": 0.19783156319626172, + "learning_rate": 8.702469619756959e-06, + "loss": 0.3512, + "step": 2392 + }, + { + "epoch": 2.5322751322751325, + "grad_norm": 0.1892822612166525, + "learning_rate": 8.682869462955705e-06, + "loss": 0.3275, + "step": 2393 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 0.19170942956582057, + "learning_rate": 8.663269306154449e-06, + "loss": 0.3128, + "step": 2394 + }, + { + "epoch": 2.5343915343915344, + "grad_norm": 0.21434651400329688, + "learning_rate": 8.643669149353195e-06, + "loss": 0.371, + "step": 2395 + }, + { + "epoch": 2.5354497354497356, + "grad_norm": 0.20111551143204334, + "learning_rate": 8.62406899255194e-06, + "loss": 0.3564, + "step": 2396 + }, + { + "epoch": 2.5365079365079364, + "grad_norm": 0.18854536443130432, + "learning_rate": 8.604468835750686e-06, + "loss": 0.3282, + "step": 2397 + }, + { + "epoch": 2.5375661375661376, + "grad_norm": 0.2024488750022803, + "learning_rate": 8.584868678949432e-06, + "loss": 0.3738, + "step": 2398 + }, + { + "epoch": 2.538624338624339, + "grad_norm": 0.20611891154190004, + "learning_rate": 8.565268522148178e-06, + "loss": 0.3811, + "step": 2399 + }, + { + "epoch": 2.5396825396825395, + "grad_norm": 0.20495838632725324, + "learning_rate": 8.545668365346924e-06, + "loss": 0.3613, + "step": 2400 + }, + { + "epoch": 2.5407407407407407, + "grad_norm": 0.19072363427225575, + "learning_rate": 8.52606820854567e-06, + "loss": 0.3369, + "step": 2401 + }, + { + "epoch": 2.541798941798942, + "grad_norm": 0.19789802048128188, + "learning_rate": 8.506468051744414e-06, + "loss": 0.3239, + "step": 2402 + }, + { + "epoch": 2.5428571428571427, + "grad_norm": 0.21881605201200133, + "learning_rate": 8.48686789494316e-06, + "loss": 0.3218, + "step": 2403 + }, + { + "epoch": 2.543915343915344, + "grad_norm": 0.20069563327958265, + "learning_rate": 8.467267738141906e-06, + "loss": 0.3424, + "step": 2404 + }, + { + "epoch": 2.544973544973545, + "grad_norm": 0.2075442824382541, + "learning_rate": 8.447667581340652e-06, + "loss": 0.3579, + "step": 2405 + }, + { + "epoch": 2.546031746031746, + "grad_norm": 0.20209877990797678, + "learning_rate": 8.428067424539398e-06, + "loss": 0.3658, + "step": 2406 + }, + { + "epoch": 2.547089947089947, + "grad_norm": 0.20347999261766514, + "learning_rate": 8.408467267738142e-06, + "loss": 0.3624, + "step": 2407 + }, + { + "epoch": 2.5481481481481483, + "grad_norm": 0.19601569897380608, + "learning_rate": 8.388867110936888e-06, + "loss": 0.3342, + "step": 2408 + }, + { + "epoch": 2.549206349206349, + "grad_norm": 0.20238610460423004, + "learning_rate": 8.369266954135634e-06, + "loss": 0.3509, + "step": 2409 + }, + { + "epoch": 2.5502645502645502, + "grad_norm": 0.20063742492135442, + "learning_rate": 8.34966679733438e-06, + "loss": 0.3398, + "step": 2410 + }, + { + "epoch": 2.5513227513227514, + "grad_norm": 0.20819913291416195, + "learning_rate": 8.330066640533124e-06, + "loss": 0.3712, + "step": 2411 + }, + { + "epoch": 2.552380952380952, + "grad_norm": 0.20770506244650877, + "learning_rate": 8.31046648373187e-06, + "loss": 0.3702, + "step": 2412 + }, + { + "epoch": 2.5534391534391534, + "grad_norm": 0.1908698216977231, + "learning_rate": 8.290866326930615e-06, + "loss": 0.3442, + "step": 2413 + }, + { + "epoch": 2.5544973544973546, + "grad_norm": 0.22309789081285533, + "learning_rate": 8.271266170129361e-06, + "loss": 0.3393, + "step": 2414 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.2047928716854341, + "learning_rate": 8.251666013328107e-06, + "loss": 0.3602, + "step": 2415 + }, + { + "epoch": 2.5566137566137566, + "grad_norm": 0.19373128203836848, + "learning_rate": 8.232065856526853e-06, + "loss": 0.3192, + "step": 2416 + }, + { + "epoch": 2.5576719576719578, + "grad_norm": 0.2243631469556603, + "learning_rate": 8.212465699725599e-06, + "loss": 0.3471, + "step": 2417 + }, + { + "epoch": 2.5587301587301585, + "grad_norm": 0.2030278112277129, + "learning_rate": 8.192865542924345e-06, + "loss": 0.2942, + "step": 2418 + }, + { + "epoch": 2.5597883597883597, + "grad_norm": 0.21497459789916237, + "learning_rate": 8.17326538612309e-06, + "loss": 0.4136, + "step": 2419 + }, + { + "epoch": 2.560846560846561, + "grad_norm": 0.22645848971121996, + "learning_rate": 8.153665229321835e-06, + "loss": 0.3705, + "step": 2420 + }, + { + "epoch": 2.5619047619047617, + "grad_norm": 0.18927547831969765, + "learning_rate": 8.13406507252058e-06, + "loss": 0.287, + "step": 2421 + }, + { + "epoch": 2.562962962962963, + "grad_norm": 0.19829367281019175, + "learning_rate": 8.114464915719327e-06, + "loss": 0.3342, + "step": 2422 + }, + { + "epoch": 2.564021164021164, + "grad_norm": 0.21043939582778504, + "learning_rate": 8.094864758918072e-06, + "loss": 0.371, + "step": 2423 + }, + { + "epoch": 2.565079365079365, + "grad_norm": 0.21176652701168425, + "learning_rate": 8.075264602116818e-06, + "loss": 0.361, + "step": 2424 + }, + { + "epoch": 2.566137566137566, + "grad_norm": 0.19858482433563368, + "learning_rate": 8.055664445315562e-06, + "loss": 0.3435, + "step": 2425 + }, + { + "epoch": 2.5671957671957673, + "grad_norm": 0.1930389532486009, + "learning_rate": 8.036064288514308e-06, + "loss": 0.3116, + "step": 2426 + }, + { + "epoch": 2.568253968253968, + "grad_norm": 0.2017032704364643, + "learning_rate": 8.016464131713054e-06, + "loss": 0.3526, + "step": 2427 + }, + { + "epoch": 2.569312169312169, + "grad_norm": 0.2100015873501434, + "learning_rate": 7.9968639749118e-06, + "loss": 0.3647, + "step": 2428 + }, + { + "epoch": 2.5703703703703704, + "grad_norm": 0.20235907257311597, + "learning_rate": 7.977263818110544e-06, + "loss": 0.3405, + "step": 2429 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.19998918287012984, + "learning_rate": 7.95766366130929e-06, + "loss": 0.376, + "step": 2430 + }, + { + "epoch": 2.5724867724867724, + "grad_norm": 0.19691447139385335, + "learning_rate": 7.938063504508036e-06, + "loss": 0.3494, + "step": 2431 + }, + { + "epoch": 2.5735449735449736, + "grad_norm": 0.2135251585389222, + "learning_rate": 7.918463347706782e-06, + "loss": 0.3532, + "step": 2432 + }, + { + "epoch": 2.5746031746031743, + "grad_norm": 0.201995066867421, + "learning_rate": 7.898863190905528e-06, + "loss": 0.3443, + "step": 2433 + }, + { + "epoch": 2.5756613756613755, + "grad_norm": 0.18780759974783323, + "learning_rate": 7.879263034104274e-06, + "loss": 0.3067, + "step": 2434 + }, + { + "epoch": 2.5767195767195767, + "grad_norm": 0.19161770345353862, + "learning_rate": 7.85966287730302e-06, + "loss": 0.3165, + "step": 2435 + }, + { + "epoch": 2.5777777777777775, + "grad_norm": 2.4738622305406635, + "learning_rate": 7.840062720501765e-06, + "loss": 0.4718, + "step": 2436 + }, + { + "epoch": 2.5788359788359787, + "grad_norm": 0.2163467306235148, + "learning_rate": 7.820462563700511e-06, + "loss": 0.3272, + "step": 2437 + }, + { + "epoch": 2.57989417989418, + "grad_norm": 0.20305099508166402, + "learning_rate": 7.800862406899255e-06, + "loss": 0.3354, + "step": 2438 + }, + { + "epoch": 2.580952380952381, + "grad_norm": 0.2251581241069781, + "learning_rate": 7.781262250098001e-06, + "loss": 0.3525, + "step": 2439 + }, + { + "epoch": 2.582010582010582, + "grad_norm": 0.20829738004575854, + "learning_rate": 7.761662093296747e-06, + "loss": 0.357, + "step": 2440 + }, + { + "epoch": 2.583068783068783, + "grad_norm": 0.20453941398613518, + "learning_rate": 7.742061936495493e-06, + "loss": 0.3655, + "step": 2441 + }, + { + "epoch": 2.5841269841269843, + "grad_norm": 0.2084221126285046, + "learning_rate": 7.722461779694237e-06, + "loss": 0.3545, + "step": 2442 + }, + { + "epoch": 2.585185185185185, + "grad_norm": 0.20422846872501063, + "learning_rate": 7.702861622892983e-06, + "loss": 0.3505, + "step": 2443 + }, + { + "epoch": 2.5862433862433862, + "grad_norm": 0.23322575042300528, + "learning_rate": 7.683261466091729e-06, + "loss": 0.4038, + "step": 2444 + }, + { + "epoch": 2.5873015873015874, + "grad_norm": 0.2149540741964415, + "learning_rate": 7.663661309290475e-06, + "loss": 0.365, + "step": 2445 + }, + { + "epoch": 2.588359788359788, + "grad_norm": 0.23658255878401824, + "learning_rate": 7.64406115248922e-06, + "loss": 0.3352, + "step": 2446 + }, + { + "epoch": 2.5894179894179894, + "grad_norm": 0.193188372100089, + "learning_rate": 7.624460995687966e-06, + "loss": 0.3533, + "step": 2447 + }, + { + "epoch": 2.5904761904761906, + "grad_norm": 0.2156461994282791, + "learning_rate": 7.604860838886712e-06, + "loss": 0.3092, + "step": 2448 + }, + { + "epoch": 2.591534391534392, + "grad_norm": 0.21459254756654236, + "learning_rate": 7.5852606820854575e-06, + "loss": 0.371, + "step": 2449 + }, + { + "epoch": 2.5925925925925926, + "grad_norm": 0.20067238928848508, + "learning_rate": 7.565660525284203e-06, + "loss": 0.3334, + "step": 2450 + }, + { + "epoch": 2.5936507936507938, + "grad_norm": 0.20761937713286907, + "learning_rate": 7.5460603684829476e-06, + "loss": 0.348, + "step": 2451 + }, + { + "epoch": 2.594708994708995, + "grad_norm": 0.19344185405933195, + "learning_rate": 7.5264602116816934e-06, + "loss": 0.3343, + "step": 2452 + }, + { + "epoch": 2.5957671957671957, + "grad_norm": 0.20457097110759648, + "learning_rate": 7.506860054880439e-06, + "loss": 0.3588, + "step": 2453 + }, + { + "epoch": 2.596825396825397, + "grad_norm": 0.19256191638601902, + "learning_rate": 7.487259898079185e-06, + "loss": 0.3044, + "step": 2454 + }, + { + "epoch": 2.597883597883598, + "grad_norm": 0.19715791865716, + "learning_rate": 7.467659741277931e-06, + "loss": 0.3438, + "step": 2455 + }, + { + "epoch": 2.598941798941799, + "grad_norm": 0.20066467071473018, + "learning_rate": 7.448059584476676e-06, + "loss": 0.3708, + "step": 2456 + }, + { + "epoch": 2.6, + "grad_norm": 0.2061072533569407, + "learning_rate": 7.428459427675422e-06, + "loss": 0.365, + "step": 2457 + }, + { + "epoch": 2.6010582010582013, + "grad_norm": 0.190635839627958, + "learning_rate": 7.408859270874168e-06, + "loss": 0.3185, + "step": 2458 + }, + { + "epoch": 2.602116402116402, + "grad_norm": 0.21002157908314345, + "learning_rate": 7.389259114072914e-06, + "loss": 0.3322, + "step": 2459 + }, + { + "epoch": 2.6031746031746033, + "grad_norm": 0.18713946271024856, + "learning_rate": 7.369658957271658e-06, + "loss": 0.3126, + "step": 2460 + }, + { + "epoch": 2.6042328042328045, + "grad_norm": 0.2132276955579535, + "learning_rate": 7.350058800470404e-06, + "loss": 0.3714, + "step": 2461 + }, + { + "epoch": 2.605291005291005, + "grad_norm": 0.1976682334299022, + "learning_rate": 7.33045864366915e-06, + "loss": 0.3446, + "step": 2462 + }, + { + "epoch": 2.6063492063492064, + "grad_norm": 0.19144499037515725, + "learning_rate": 7.3108584868678955e-06, + "loss": 0.334, + "step": 2463 + }, + { + "epoch": 2.6074074074074076, + "grad_norm": 0.20970237193504582, + "learning_rate": 7.291258330066641e-06, + "loss": 0.3803, + "step": 2464 + }, + { + "epoch": 2.6084656084656084, + "grad_norm": 0.19369116393515454, + "learning_rate": 7.271658173265386e-06, + "loss": 0.2944, + "step": 2465 + }, + { + "epoch": 2.6095238095238096, + "grad_norm": 0.20464263215117728, + "learning_rate": 7.252058016464132e-06, + "loss": 0.3118, + "step": 2466 + }, + { + "epoch": 2.610582010582011, + "grad_norm": 0.19514819457072263, + "learning_rate": 7.232457859662878e-06, + "loss": 0.3373, + "step": 2467 + }, + { + "epoch": 2.6116402116402115, + "grad_norm": 0.2054577341252547, + "learning_rate": 7.212857702861624e-06, + "loss": 0.3312, + "step": 2468 + }, + { + "epoch": 2.6126984126984127, + "grad_norm": 0.218003406758833, + "learning_rate": 7.193257546060368e-06, + "loss": 0.3636, + "step": 2469 + }, + { + "epoch": 2.613756613756614, + "grad_norm": 0.44213851229624174, + "learning_rate": 7.173657389259114e-06, + "loss": 0.3976, + "step": 2470 + }, + { + "epoch": 2.6148148148148147, + "grad_norm": 0.2009534741403274, + "learning_rate": 7.15405723245786e-06, + "loss": 0.3442, + "step": 2471 + }, + { + "epoch": 2.615873015873016, + "grad_norm": 0.20912608516036513, + "learning_rate": 7.134457075656606e-06, + "loss": 0.3854, + "step": 2472 + }, + { + "epoch": 2.616931216931217, + "grad_norm": 0.20078773548602796, + "learning_rate": 7.114856918855352e-06, + "loss": 0.3793, + "step": 2473 + }, + { + "epoch": 2.617989417989418, + "grad_norm": 0.21374608990215083, + "learning_rate": 7.095256762054097e-06, + "loss": 0.3883, + "step": 2474 + }, + { + "epoch": 2.619047619047619, + "grad_norm": 0.20326986212714018, + "learning_rate": 7.075656605252843e-06, + "loss": 0.3634, + "step": 2475 + }, + { + "epoch": 2.6201058201058203, + "grad_norm": 0.184483550265106, + "learning_rate": 7.0560564484515885e-06, + "loss": 0.3251, + "step": 2476 + }, + { + "epoch": 2.621164021164021, + "grad_norm": 0.20518627904667391, + "learning_rate": 7.036456291650334e-06, + "loss": 0.3383, + "step": 2477 + }, + { + "epoch": 2.6222222222222222, + "grad_norm": 0.2213975218224379, + "learning_rate": 7.0168561348490785e-06, + "loss": 0.3805, + "step": 2478 + }, + { + "epoch": 2.6232804232804234, + "grad_norm": 0.20669880976906882, + "learning_rate": 6.997255978047824e-06, + "loss": 0.3293, + "step": 2479 + }, + { + "epoch": 2.624338624338624, + "grad_norm": 0.20421330013118466, + "learning_rate": 6.97765582124657e-06, + "loss": 0.3308, + "step": 2480 + }, + { + "epoch": 2.6253968253968254, + "grad_norm": 0.23282007125318938, + "learning_rate": 6.958055664445316e-06, + "loss": 0.3844, + "step": 2481 + }, + { + "epoch": 2.6264550264550266, + "grad_norm": 0.20766235706851663, + "learning_rate": 6.938455507644062e-06, + "loss": 0.3729, + "step": 2482 + }, + { + "epoch": 2.6275132275132274, + "grad_norm": 0.20961331760975993, + "learning_rate": 6.918855350842807e-06, + "loss": 0.3733, + "step": 2483 + }, + { + "epoch": 2.6285714285714286, + "grad_norm": 0.20224319032663668, + "learning_rate": 6.899255194041553e-06, + "loss": 0.3737, + "step": 2484 + }, + { + "epoch": 2.6296296296296298, + "grad_norm": 0.17940459522018293, + "learning_rate": 6.879655037240299e-06, + "loss": 0.3172, + "step": 2485 + }, + { + "epoch": 2.6306878306878305, + "grad_norm": 0.18842546291187015, + "learning_rate": 6.860054880439045e-06, + "loss": 0.3237, + "step": 2486 + }, + { + "epoch": 2.6317460317460317, + "grad_norm": 0.20586867202259435, + "learning_rate": 6.840454723637789e-06, + "loss": 0.3341, + "step": 2487 + }, + { + "epoch": 2.632804232804233, + "grad_norm": 0.19471736282988128, + "learning_rate": 6.820854566836535e-06, + "loss": 0.3179, + "step": 2488 + }, + { + "epoch": 2.6338624338624337, + "grad_norm": 0.20284155346377497, + "learning_rate": 6.801254410035281e-06, + "loss": 0.3665, + "step": 2489 + }, + { + "epoch": 2.634920634920635, + "grad_norm": 0.18879946930455782, + "learning_rate": 6.7816542532340265e-06, + "loss": 0.3259, + "step": 2490 + }, + { + "epoch": 2.635978835978836, + "grad_norm": 0.19398530466899666, + "learning_rate": 6.762054096432771e-06, + "loss": 0.3179, + "step": 2491 + }, + { + "epoch": 2.637037037037037, + "grad_norm": 0.19234318749673635, + "learning_rate": 6.742453939631517e-06, + "loss": 0.3446, + "step": 2492 + }, + { + "epoch": 2.638095238095238, + "grad_norm": 0.21354926873262906, + "learning_rate": 6.722853782830263e-06, + "loss": 0.3525, + "step": 2493 + }, + { + "epoch": 2.6391534391534393, + "grad_norm": 0.20334093193943195, + "learning_rate": 6.703253626029009e-06, + "loss": 0.36, + "step": 2494 + }, + { + "epoch": 2.64021164021164, + "grad_norm": 0.2035547977347391, + "learning_rate": 6.683653469227755e-06, + "loss": 0.3542, + "step": 2495 + }, + { + "epoch": 2.641269841269841, + "grad_norm": 0.19627021534313335, + "learning_rate": 6.664053312426499e-06, + "loss": 0.334, + "step": 2496 + }, + { + "epoch": 2.6423280423280424, + "grad_norm": 0.2086234488088156, + "learning_rate": 6.644453155625245e-06, + "loss": 0.3619, + "step": 2497 + }, + { + "epoch": 2.643386243386243, + "grad_norm": 0.2201861596648691, + "learning_rate": 6.624852998823991e-06, + "loss": 0.389, + "step": 2498 + }, + { + "epoch": 2.6444444444444444, + "grad_norm": 0.20846191600277475, + "learning_rate": 6.605252842022737e-06, + "loss": 0.3779, + "step": 2499 + }, + { + "epoch": 2.6455026455026456, + "grad_norm": 0.20734012321675435, + "learning_rate": 6.585652685221482e-06, + "loss": 0.3508, + "step": 2500 + }, + { + "epoch": 2.6465608465608463, + "grad_norm": 0.19345863392721405, + "learning_rate": 6.566052528420228e-06, + "loss": 0.3295, + "step": 2501 + }, + { + "epoch": 2.6476190476190475, + "grad_norm": 0.20354235487098435, + "learning_rate": 6.546452371618974e-06, + "loss": 0.376, + "step": 2502 + }, + { + "epoch": 2.6486772486772487, + "grad_norm": 0.19936131307592084, + "learning_rate": 6.5268522148177195e-06, + "loss": 0.3372, + "step": 2503 + }, + { + "epoch": 2.6497354497354495, + "grad_norm": 0.19833040616276873, + "learning_rate": 6.507252058016465e-06, + "loss": 0.3249, + "step": 2504 + }, + { + "epoch": 2.6507936507936507, + "grad_norm": 0.22382325734125558, + "learning_rate": 6.4876519012152095e-06, + "loss": 0.3898, + "step": 2505 + }, + { + "epoch": 2.651851851851852, + "grad_norm": 0.2086391866302971, + "learning_rate": 6.468051744413955e-06, + "loss": 0.3822, + "step": 2506 + }, + { + "epoch": 2.6529100529100527, + "grad_norm": 0.19118652570324526, + "learning_rate": 6.448451587612701e-06, + "loss": 0.3232, + "step": 2507 + }, + { + "epoch": 2.653968253968254, + "grad_norm": 0.1992916481134372, + "learning_rate": 6.428851430811447e-06, + "loss": 0.3596, + "step": 2508 + }, + { + "epoch": 2.655026455026455, + "grad_norm": 0.20052752929381737, + "learning_rate": 6.409251274010192e-06, + "loss": 0.3474, + "step": 2509 + }, + { + "epoch": 2.656084656084656, + "grad_norm": 0.1892696179940764, + "learning_rate": 6.389651117208938e-06, + "loss": 0.3491, + "step": 2510 + }, + { + "epoch": 2.657142857142857, + "grad_norm": 0.18821467544895704, + "learning_rate": 6.370050960407684e-06, + "loss": 0.3406, + "step": 2511 + }, + { + "epoch": 2.6582010582010582, + "grad_norm": 0.2146316899839704, + "learning_rate": 6.35045080360643e-06, + "loss": 0.3268, + "step": 2512 + }, + { + "epoch": 2.659259259259259, + "grad_norm": 0.1924428845095873, + "learning_rate": 6.330850646805176e-06, + "loss": 0.3278, + "step": 2513 + }, + { + "epoch": 2.66031746031746, + "grad_norm": 0.20147304345647865, + "learning_rate": 6.31125049000392e-06, + "loss": 0.3452, + "step": 2514 + }, + { + "epoch": 2.6613756613756614, + "grad_norm": 0.18452957282061885, + "learning_rate": 6.291650333202666e-06, + "loss": 0.3281, + "step": 2515 + }, + { + "epoch": 2.6624338624338626, + "grad_norm": 0.1857986422958092, + "learning_rate": 6.272050176401412e-06, + "loss": 0.3101, + "step": 2516 + }, + { + "epoch": 2.6634920634920634, + "grad_norm": 0.21536712844728778, + "learning_rate": 6.2524500196001575e-06, + "loss": 0.3835, + "step": 2517 + }, + { + "epoch": 2.6645502645502646, + "grad_norm": 0.19196672833526432, + "learning_rate": 6.2328498627989025e-06, + "loss": 0.331, + "step": 2518 + }, + { + "epoch": 2.6656084656084658, + "grad_norm": 0.20346805393197445, + "learning_rate": 6.213249705997648e-06, + "loss": 0.3718, + "step": 2519 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.19865890016978405, + "learning_rate": 6.193649549196394e-06, + "loss": 0.3476, + "step": 2520 + }, + { + "epoch": 2.6677248677248677, + "grad_norm": 0.1868360318396176, + "learning_rate": 6.17404939239514e-06, + "loss": 0.3461, + "step": 2521 + }, + { + "epoch": 2.668783068783069, + "grad_norm": 0.18283098776282525, + "learning_rate": 6.154449235593885e-06, + "loss": 0.3185, + "step": 2522 + }, + { + "epoch": 2.6698412698412697, + "grad_norm": 0.19611612634805634, + "learning_rate": 6.134849078792631e-06, + "loss": 0.3348, + "step": 2523 + }, + { + "epoch": 2.670899470899471, + "grad_norm": 0.19050047780185728, + "learning_rate": 6.115248921991376e-06, + "loss": 0.3277, + "step": 2524 + }, + { + "epoch": 2.671957671957672, + "grad_norm": 0.2084510520678467, + "learning_rate": 6.095648765190122e-06, + "loss": 0.3225, + "step": 2525 + }, + { + "epoch": 2.6730158730158733, + "grad_norm": 0.1922454758562699, + "learning_rate": 6.076048608388867e-06, + "loss": 0.3602, + "step": 2526 + }, + { + "epoch": 2.674074074074074, + "grad_norm": 0.19098409079480982, + "learning_rate": 6.056448451587613e-06, + "loss": 0.3501, + "step": 2527 + }, + { + "epoch": 2.6751322751322753, + "grad_norm": 0.19240132190670328, + "learning_rate": 6.036848294786359e-06, + "loss": 0.3455, + "step": 2528 + }, + { + "epoch": 2.6761904761904765, + "grad_norm": 0.18234297191193383, + "learning_rate": 6.017248137985105e-06, + "loss": 0.3248, + "step": 2529 + }, + { + "epoch": 2.677248677248677, + "grad_norm": 0.18076554021932248, + "learning_rate": 5.99764798118385e-06, + "loss": 0.3065, + "step": 2530 + }, + { + "epoch": 2.6783068783068784, + "grad_norm": 0.1929979054056131, + "learning_rate": 5.9780478243825955e-06, + "loss": 0.3604, + "step": 2531 + }, + { + "epoch": 2.6793650793650796, + "grad_norm": 0.20221424529505985, + "learning_rate": 5.958447667581341e-06, + "loss": 0.3857, + "step": 2532 + }, + { + "epoch": 2.6804232804232804, + "grad_norm": 0.20020493436720654, + "learning_rate": 5.938847510780086e-06, + "loss": 0.3353, + "step": 2533 + }, + { + "epoch": 2.6814814814814816, + "grad_norm": 0.1979878253122416, + "learning_rate": 5.919247353978832e-06, + "loss": 0.3524, + "step": 2534 + }, + { + "epoch": 2.682539682539683, + "grad_norm": 0.1878994433039551, + "learning_rate": 5.899647197177577e-06, + "loss": 0.3234, + "step": 2535 + }, + { + "epoch": 2.6835978835978835, + "grad_norm": 0.20303922395745672, + "learning_rate": 5.880047040376323e-06, + "loss": 0.3472, + "step": 2536 + }, + { + "epoch": 2.6846560846560847, + "grad_norm": 0.1964831514888397, + "learning_rate": 5.860446883575069e-06, + "loss": 0.3512, + "step": 2537 + }, + { + "epoch": 2.685714285714286, + "grad_norm": 0.20050158418913078, + "learning_rate": 5.840846726773815e-06, + "loss": 0.368, + "step": 2538 + }, + { + "epoch": 2.6867724867724867, + "grad_norm": 0.19809108859289698, + "learning_rate": 5.82124656997256e-06, + "loss": 0.3642, + "step": 2539 + }, + { + "epoch": 2.687830687830688, + "grad_norm": 0.19542084982885585, + "learning_rate": 5.801646413171306e-06, + "loss": 0.3505, + "step": 2540 + }, + { + "epoch": 2.688888888888889, + "grad_norm": 0.25026144659287575, + "learning_rate": 5.782046256370052e-06, + "loss": 0.3564, + "step": 2541 + }, + { + "epoch": 2.68994708994709, + "grad_norm": 0.19368060067900453, + "learning_rate": 5.762446099568797e-06, + "loss": 0.3444, + "step": 2542 + }, + { + "epoch": 2.691005291005291, + "grad_norm": 0.2015994007513886, + "learning_rate": 5.742845942767543e-06, + "loss": 0.3331, + "step": 2543 + }, + { + "epoch": 2.6920634920634923, + "grad_norm": 0.20202195926051253, + "learning_rate": 5.723245785966288e-06, + "loss": 0.3452, + "step": 2544 + }, + { + "epoch": 2.693121693121693, + "grad_norm": 0.18149608421071714, + "learning_rate": 5.7036456291650335e-06, + "loss": 0.2989, + "step": 2545 + }, + { + "epoch": 2.6941798941798942, + "grad_norm": 0.21151146883978197, + "learning_rate": 5.684045472363779e-06, + "loss": 0.3899, + "step": 2546 + }, + { + "epoch": 2.6952380952380954, + "grad_norm": 0.19106657712419947, + "learning_rate": 5.664445315562525e-06, + "loss": 0.3133, + "step": 2547 + }, + { + "epoch": 2.696296296296296, + "grad_norm": 0.204486097348059, + "learning_rate": 5.64484515876127e-06, + "loss": 0.3243, + "step": 2548 + }, + { + "epoch": 2.6973544973544974, + "grad_norm": 0.2120307771160055, + "learning_rate": 5.625245001960016e-06, + "loss": 0.3616, + "step": 2549 + }, + { + "epoch": 2.6984126984126986, + "grad_norm": 0.17095776864486206, + "learning_rate": 5.605644845158761e-06, + "loss": 0.2899, + "step": 2550 + }, + { + "epoch": 2.6994708994708994, + "grad_norm": 0.21843454968847806, + "learning_rate": 5.586044688357507e-06, + "loss": 0.3463, + "step": 2551 + }, + { + "epoch": 2.7005291005291006, + "grad_norm": 0.19118514569525785, + "learning_rate": 5.566444531556253e-06, + "loss": 0.3216, + "step": 2552 + }, + { + "epoch": 2.7015873015873018, + "grad_norm": 0.25238654920190934, + "learning_rate": 5.546844374754998e-06, + "loss": 0.3795, + "step": 2553 + }, + { + "epoch": 2.7026455026455025, + "grad_norm": 0.20643432168257772, + "learning_rate": 5.527244217953744e-06, + "loss": 0.366, + "step": 2554 + }, + { + "epoch": 2.7037037037037037, + "grad_norm": 0.1951728012706216, + "learning_rate": 5.50764406115249e-06, + "loss": 0.3211, + "step": 2555 + }, + { + "epoch": 2.704761904761905, + "grad_norm": 0.19406462105051225, + "learning_rate": 5.488043904351236e-06, + "loss": 0.3226, + "step": 2556 + }, + { + "epoch": 2.7058201058201057, + "grad_norm": 0.19140790364896687, + "learning_rate": 5.468443747549981e-06, + "loss": 0.32, + "step": 2557 + }, + { + "epoch": 2.706878306878307, + "grad_norm": 0.19071360052522665, + "learning_rate": 5.4488435907487265e-06, + "loss": 0.3619, + "step": 2558 + }, + { + "epoch": 2.707936507936508, + "grad_norm": 0.2066029798654874, + "learning_rate": 5.4292434339474715e-06, + "loss": 0.37, + "step": 2559 + }, + { + "epoch": 2.708994708994709, + "grad_norm": 0.2000206802025288, + "learning_rate": 5.409643277146217e-06, + "loss": 0.3739, + "step": 2560 + }, + { + "epoch": 2.71005291005291, + "grad_norm": 0.1992451517241142, + "learning_rate": 5.390043120344963e-06, + "loss": 0.2976, + "step": 2561 + }, + { + "epoch": 2.7111111111111112, + "grad_norm": 0.21041064516580602, + "learning_rate": 5.370442963543708e-06, + "loss": 0.3372, + "step": 2562 + }, + { + "epoch": 2.712169312169312, + "grad_norm": 0.1965382228229818, + "learning_rate": 5.350842806742454e-06, + "loss": 0.3181, + "step": 2563 + }, + { + "epoch": 2.713227513227513, + "grad_norm": 0.19553975072971144, + "learning_rate": 5.3312426499412e-06, + "loss": 0.3334, + "step": 2564 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 0.182293586725453, + "learning_rate": 5.311642493139946e-06, + "loss": 0.312, + "step": 2565 + }, + { + "epoch": 2.715343915343915, + "grad_norm": 0.22281231749006977, + "learning_rate": 5.292042336338691e-06, + "loss": 0.4265, + "step": 2566 + }, + { + "epoch": 2.7164021164021164, + "grad_norm": 0.19852030519721917, + "learning_rate": 5.272442179537437e-06, + "loss": 0.3736, + "step": 2567 + }, + { + "epoch": 2.7174603174603176, + "grad_norm": 0.20060353048459667, + "learning_rate": 5.252842022736182e-06, + "loss": 0.3608, + "step": 2568 + }, + { + "epoch": 2.7185185185185183, + "grad_norm": 0.2043124213401307, + "learning_rate": 5.233241865934928e-06, + "loss": 0.3695, + "step": 2569 + }, + { + "epoch": 2.7195767195767195, + "grad_norm": 0.21279444075120416, + "learning_rate": 5.213641709133673e-06, + "loss": 0.3461, + "step": 2570 + }, + { + "epoch": 2.7206349206349207, + "grad_norm": 0.1962353653907673, + "learning_rate": 5.194041552332419e-06, + "loss": 0.3479, + "step": 2571 + }, + { + "epoch": 2.7216931216931215, + "grad_norm": 0.19136776040919806, + "learning_rate": 5.1744413955311645e-06, + "loss": 0.3381, + "step": 2572 + }, + { + "epoch": 2.7227513227513227, + "grad_norm": 0.19168537594384008, + "learning_rate": 5.15484123872991e-06, + "loss": 0.332, + "step": 2573 + }, + { + "epoch": 2.723809523809524, + "grad_norm": 0.18822366734415621, + "learning_rate": 5.135241081928656e-06, + "loss": 0.3287, + "step": 2574 + }, + { + "epoch": 2.7248677248677247, + "grad_norm": 0.2077200076431067, + "learning_rate": 5.115640925127401e-06, + "loss": 0.3431, + "step": 2575 + }, + { + "epoch": 2.725925925925926, + "grad_norm": 0.19269703180103093, + "learning_rate": 5.096040768326147e-06, + "loss": 0.3086, + "step": 2576 + }, + { + "epoch": 2.726984126984127, + "grad_norm": 0.18860716399222538, + "learning_rate": 5.076440611524892e-06, + "loss": 0.3154, + "step": 2577 + }, + { + "epoch": 2.728042328042328, + "grad_norm": 0.1958899798494481, + "learning_rate": 5.056840454723638e-06, + "loss": 0.3111, + "step": 2578 + }, + { + "epoch": 2.729100529100529, + "grad_norm": 0.1820101679127598, + "learning_rate": 5.037240297922383e-06, + "loss": 0.3145, + "step": 2579 + }, + { + "epoch": 2.7301587301587302, + "grad_norm": 0.1983802921074873, + "learning_rate": 5.017640141121129e-06, + "loss": 0.3595, + "step": 2580 + }, + { + "epoch": 2.731216931216931, + "grad_norm": 0.19970794198008765, + "learning_rate": 4.998039984319875e-06, + "loss": 0.3572, + "step": 2581 + }, + { + "epoch": 2.732275132275132, + "grad_norm": 0.1893406404078852, + "learning_rate": 4.978439827518621e-06, + "loss": 0.3426, + "step": 2582 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 0.20404466163446328, + "learning_rate": 4.9588396707173666e-06, + "loss": 0.3658, + "step": 2583 + }, + { + "epoch": 2.734391534391534, + "grad_norm": 0.18412114768156596, + "learning_rate": 4.939239513916112e-06, + "loss": 0.3046, + "step": 2584 + }, + { + "epoch": 2.7354497354497354, + "grad_norm": 0.2075384994168872, + "learning_rate": 4.9196393571148575e-06, + "loss": 0.338, + "step": 2585 + }, + { + "epoch": 2.7365079365079366, + "grad_norm": 0.22412000158196252, + "learning_rate": 4.9000392003136025e-06, + "loss": 0.4155, + "step": 2586 + }, + { + "epoch": 2.7375661375661373, + "grad_norm": 0.20401992100764668, + "learning_rate": 4.880439043512348e-06, + "loss": 0.3865, + "step": 2587 + }, + { + "epoch": 2.7386243386243385, + "grad_norm": 0.18604920498095107, + "learning_rate": 4.860838886711093e-06, + "loss": 0.3134, + "step": 2588 + }, + { + "epoch": 2.7396825396825397, + "grad_norm": 0.21195152122045988, + "learning_rate": 4.841238729909839e-06, + "loss": 0.3833, + "step": 2589 + }, + { + "epoch": 2.7407407407407405, + "grad_norm": 0.19793521415832854, + "learning_rate": 4.821638573108585e-06, + "loss": 0.3654, + "step": 2590 + }, + { + "epoch": 2.7417989417989417, + "grad_norm": 0.19239288309471586, + "learning_rate": 4.802038416307331e-06, + "loss": 0.3311, + "step": 2591 + }, + { + "epoch": 2.742857142857143, + "grad_norm": 0.18478342433128606, + "learning_rate": 4.782438259506077e-06, + "loss": 0.3196, + "step": 2592 + }, + { + "epoch": 2.7439153439153436, + "grad_norm": 0.19737300368776284, + "learning_rate": 4.762838102704822e-06, + "loss": 0.3399, + "step": 2593 + }, + { + "epoch": 2.744973544973545, + "grad_norm": 0.2044197261835309, + "learning_rate": 4.743237945903568e-06, + "loss": 0.3874, + "step": 2594 + }, + { + "epoch": 2.746031746031746, + "grad_norm": 0.2160010979562428, + "learning_rate": 4.723637789102313e-06, + "loss": 0.385, + "step": 2595 + }, + { + "epoch": 2.7470899470899472, + "grad_norm": 0.18127732926658155, + "learning_rate": 4.704037632301059e-06, + "loss": 0.3192, + "step": 2596 + }, + { + "epoch": 2.748148148148148, + "grad_norm": 0.18852738977755046, + "learning_rate": 4.684437475499804e-06, + "loss": 0.3439, + "step": 2597 + }, + { + "epoch": 2.749206349206349, + "grad_norm": 0.17640615561096873, + "learning_rate": 4.66483731869855e-06, + "loss": 0.2992, + "step": 2598 + }, + { + "epoch": 2.7502645502645504, + "grad_norm": 0.20969394677698452, + "learning_rate": 4.6452371618972955e-06, + "loss": 0.4039, + "step": 2599 + }, + { + "epoch": 2.751322751322751, + "grad_norm": 0.1938761616618117, + "learning_rate": 4.625637005096041e-06, + "loss": 0.3456, + "step": 2600 + }, + { + "epoch": 2.7523809523809524, + "grad_norm": 0.20065804261742967, + "learning_rate": 4.606036848294787e-06, + "loss": 0.379, + "step": 2601 + }, + { + "epoch": 2.7534391534391536, + "grad_norm": 0.18022036699122285, + "learning_rate": 4.586436691493532e-06, + "loss": 0.2838, + "step": 2602 + }, + { + "epoch": 2.7544973544973543, + "grad_norm": 0.20021693200514237, + "learning_rate": 4.566836534692278e-06, + "loss": 0.3744, + "step": 2603 + }, + { + "epoch": 2.7555555555555555, + "grad_norm": 0.19141878650651184, + "learning_rate": 4.547236377891023e-06, + "loss": 0.2937, + "step": 2604 + }, + { + "epoch": 2.7566137566137567, + "grad_norm": 0.2529365210436403, + "learning_rate": 4.527636221089769e-06, + "loss": 0.3638, + "step": 2605 + }, + { + "epoch": 2.757671957671958, + "grad_norm": 0.1854468401388474, + "learning_rate": 4.508036064288514e-06, + "loss": 0.3123, + "step": 2606 + }, + { + "epoch": 2.7587301587301587, + "grad_norm": 0.20565208226029344, + "learning_rate": 4.48843590748726e-06, + "loss": 0.3706, + "step": 2607 + }, + { + "epoch": 2.75978835978836, + "grad_norm": 0.19044667922854414, + "learning_rate": 4.468835750686006e-06, + "loss": 0.3196, + "step": 2608 + }, + { + "epoch": 2.760846560846561, + "grad_norm": 0.19238853879054066, + "learning_rate": 4.449235593884752e-06, + "loss": 0.3148, + "step": 2609 + }, + { + "epoch": 2.761904761904762, + "grad_norm": 0.1956487628537959, + "learning_rate": 4.4296354370834976e-06, + "loss": 0.3521, + "step": 2610 + }, + { + "epoch": 2.762962962962963, + "grad_norm": 0.20209286152850098, + "learning_rate": 4.410035280282243e-06, + "loss": 0.3717, + "step": 2611 + }, + { + "epoch": 2.7640211640211643, + "grad_norm": 0.19340479563705504, + "learning_rate": 4.3904351234809885e-06, + "loss": 0.3598, + "step": 2612 + }, + { + "epoch": 2.765079365079365, + "grad_norm": 0.19526534766192782, + "learning_rate": 4.3708349666797335e-06, + "loss": 0.3709, + "step": 2613 + }, + { + "epoch": 2.7661375661375662, + "grad_norm": 0.1876434539839283, + "learning_rate": 4.351234809878479e-06, + "loss": 0.3356, + "step": 2614 + }, + { + "epoch": 2.7671957671957674, + "grad_norm": 0.18913882028697904, + "learning_rate": 4.331634653077224e-06, + "loss": 0.3331, + "step": 2615 + }, + { + "epoch": 2.768253968253968, + "grad_norm": 0.18531612005736103, + "learning_rate": 4.31203449627597e-06, + "loss": 0.3121, + "step": 2616 + }, + { + "epoch": 2.7693121693121694, + "grad_norm": 0.18210121950128233, + "learning_rate": 4.292434339474716e-06, + "loss": 0.3217, + "step": 2617 + }, + { + "epoch": 2.7703703703703706, + "grad_norm": 0.17631791372733657, + "learning_rate": 4.272834182673462e-06, + "loss": 0.3145, + "step": 2618 + }, + { + "epoch": 2.7714285714285714, + "grad_norm": 0.2052048552010276, + "learning_rate": 4.253234025872207e-06, + "loss": 0.3445, + "step": 2619 + }, + { + "epoch": 2.7724867724867726, + "grad_norm": 0.20391385871973938, + "learning_rate": 4.233633869070953e-06, + "loss": 0.3821, + "step": 2620 + }, + { + "epoch": 2.7735449735449738, + "grad_norm": 0.19379016138412378, + "learning_rate": 4.214033712269699e-06, + "loss": 0.3808, + "step": 2621 + }, + { + "epoch": 2.7746031746031745, + "grad_norm": 0.18659539827425273, + "learning_rate": 4.194433555468444e-06, + "loss": 0.3072, + "step": 2622 + }, + { + "epoch": 2.7756613756613757, + "grad_norm": 0.1894212818459714, + "learning_rate": 4.17483339866719e-06, + "loss": 0.3268, + "step": 2623 + }, + { + "epoch": 2.776719576719577, + "grad_norm": 0.190978704730525, + "learning_rate": 4.155233241865935e-06, + "loss": 0.3382, + "step": 2624 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.20109476739966778, + "learning_rate": 4.135633085064681e-06, + "loss": 0.3827, + "step": 2625 + }, + { + "epoch": 2.778835978835979, + "grad_norm": 0.19363672859817238, + "learning_rate": 4.1160329282634265e-06, + "loss": 0.3587, + "step": 2626 + }, + { + "epoch": 2.77989417989418, + "grad_norm": 0.2010695366446098, + "learning_rate": 4.096432771462172e-06, + "loss": 0.3657, + "step": 2627 + }, + { + "epoch": 2.780952380952381, + "grad_norm": 0.1910110293287592, + "learning_rate": 4.076832614660917e-06, + "loss": 0.3155, + "step": 2628 + }, + { + "epoch": 2.782010582010582, + "grad_norm": 0.19681762572714664, + "learning_rate": 4.057232457859663e-06, + "loss": 0.3781, + "step": 2629 + }, + { + "epoch": 2.7830687830687832, + "grad_norm": 0.2038735269817466, + "learning_rate": 4.037632301058409e-06, + "loss": 0.3807, + "step": 2630 + }, + { + "epoch": 2.784126984126984, + "grad_norm": 0.19777937024940856, + "learning_rate": 4.018032144257154e-06, + "loss": 0.3569, + "step": 2631 + }, + { + "epoch": 2.785185185185185, + "grad_norm": 0.1995201890239187, + "learning_rate": 3.9984319874559e-06, + "loss": 0.3701, + "step": 2632 + }, + { + "epoch": 2.7862433862433864, + "grad_norm": 0.20317171069691647, + "learning_rate": 3.978831830654645e-06, + "loss": 0.3569, + "step": 2633 + }, + { + "epoch": 2.787301587301587, + "grad_norm": 0.19068734602892104, + "learning_rate": 3.959231673853391e-06, + "loss": 0.3366, + "step": 2634 + }, + { + "epoch": 2.7883597883597884, + "grad_norm": 0.1828626835621772, + "learning_rate": 3.939631517052137e-06, + "loss": 0.3116, + "step": 2635 + }, + { + "epoch": 2.7894179894179896, + "grad_norm": 0.19548905750250636, + "learning_rate": 3.920031360250883e-06, + "loss": 0.3472, + "step": 2636 + }, + { + "epoch": 2.7904761904761903, + "grad_norm": 0.19023591210044766, + "learning_rate": 3.900431203449628e-06, + "loss": 0.3046, + "step": 2637 + }, + { + "epoch": 2.7915343915343915, + "grad_norm": 0.24034345295992002, + "learning_rate": 3.880831046648374e-06, + "loss": 0.402, + "step": 2638 + }, + { + "epoch": 2.7925925925925927, + "grad_norm": 0.1934131985141875, + "learning_rate": 3.861230889847119e-06, + "loss": 0.3431, + "step": 2639 + }, + { + "epoch": 2.7936507936507935, + "grad_norm": 0.19510076751913827, + "learning_rate": 3.8416307330458645e-06, + "loss": 0.3473, + "step": 2640 + }, + { + "epoch": 2.7947089947089947, + "grad_norm": 0.20150653464491855, + "learning_rate": 3.82203057624461e-06, + "loss": 0.3688, + "step": 2641 + }, + { + "epoch": 2.795767195767196, + "grad_norm": 0.18941533891094486, + "learning_rate": 3.802430419443356e-06, + "loss": 0.3482, + "step": 2642 + }, + { + "epoch": 2.7968253968253967, + "grad_norm": 0.20133005930916215, + "learning_rate": 3.7828302626421017e-06, + "loss": 0.3549, + "step": 2643 + }, + { + "epoch": 2.797883597883598, + "grad_norm": 0.20525961667412126, + "learning_rate": 3.7632301058408467e-06, + "loss": 0.3616, + "step": 2644 + }, + { + "epoch": 2.798941798941799, + "grad_norm": 0.19156388319779546, + "learning_rate": 3.7436299490395926e-06, + "loss": 0.3417, + "step": 2645 + }, + { + "epoch": 2.8, + "grad_norm": 0.2011256721429364, + "learning_rate": 3.724029792238338e-06, + "loss": 0.3601, + "step": 2646 + }, + { + "epoch": 2.801058201058201, + "grad_norm": 0.19619574219995886, + "learning_rate": 3.704429635437084e-06, + "loss": 0.3485, + "step": 2647 + }, + { + "epoch": 2.8021164021164022, + "grad_norm": 0.19499163232482158, + "learning_rate": 3.684829478635829e-06, + "loss": 0.3725, + "step": 2648 + }, + { + "epoch": 2.803174603174603, + "grad_norm": 0.19294474196059933, + "learning_rate": 3.665229321834575e-06, + "loss": 0.3555, + "step": 2649 + }, + { + "epoch": 2.804232804232804, + "grad_norm": 0.2032516666422444, + "learning_rate": 3.6456291650333207e-06, + "loss": 0.3661, + "step": 2650 + }, + { + "epoch": 2.8052910052910054, + "grad_norm": 0.2020734831226216, + "learning_rate": 3.626029008232066e-06, + "loss": 0.3556, + "step": 2651 + }, + { + "epoch": 2.806349206349206, + "grad_norm": 0.1916619404390491, + "learning_rate": 3.606428851430812e-06, + "loss": 0.3329, + "step": 2652 + }, + { + "epoch": 2.8074074074074074, + "grad_norm": 0.1952962673766634, + "learning_rate": 3.586828694629557e-06, + "loss": 0.3554, + "step": 2653 + }, + { + "epoch": 2.8084656084656086, + "grad_norm": 0.1962712621825182, + "learning_rate": 3.567228537828303e-06, + "loss": 0.3574, + "step": 2654 + }, + { + "epoch": 2.8095238095238093, + "grad_norm": 0.1844963945964023, + "learning_rate": 3.5476283810270484e-06, + "loss": 0.3231, + "step": 2655 + }, + { + "epoch": 2.8105820105820105, + "grad_norm": 0.18348101787536167, + "learning_rate": 3.5280282242257942e-06, + "loss": 0.3159, + "step": 2656 + }, + { + "epoch": 2.8116402116402117, + "grad_norm": 0.1849348467642649, + "learning_rate": 3.5084280674245393e-06, + "loss": 0.3231, + "step": 2657 + }, + { + "epoch": 2.8126984126984125, + "grad_norm": 0.18599313201602283, + "learning_rate": 3.488827910623285e-06, + "loss": 0.3381, + "step": 2658 + }, + { + "epoch": 2.8137566137566137, + "grad_norm": 0.18221416638152477, + "learning_rate": 3.469227753822031e-06, + "loss": 0.3136, + "step": 2659 + }, + { + "epoch": 2.814814814814815, + "grad_norm": 0.18255639179737698, + "learning_rate": 3.4496275970207765e-06, + "loss": 0.322, + "step": 2660 + }, + { + "epoch": 2.8158730158730156, + "grad_norm": 0.2095837906111075, + "learning_rate": 3.4300274402195223e-06, + "loss": 0.3377, + "step": 2661 + }, + { + "epoch": 2.816931216931217, + "grad_norm": 0.18665268042560523, + "learning_rate": 3.4104272834182674e-06, + "loss": 0.3431, + "step": 2662 + }, + { + "epoch": 2.817989417989418, + "grad_norm": 0.19002223272665303, + "learning_rate": 3.3908271266170132e-06, + "loss": 0.3342, + "step": 2663 + }, + { + "epoch": 2.819047619047619, + "grad_norm": 0.18705897801580285, + "learning_rate": 3.3712269698157587e-06, + "loss": 0.3181, + "step": 2664 + }, + { + "epoch": 2.82010582010582, + "grad_norm": 0.20862688975705004, + "learning_rate": 3.3516268130145046e-06, + "loss": 0.3668, + "step": 2665 + }, + { + "epoch": 2.821164021164021, + "grad_norm": 0.19225010946678575, + "learning_rate": 3.3320266562132496e-06, + "loss": 0.3359, + "step": 2666 + }, + { + "epoch": 2.822222222222222, + "grad_norm": 0.20161732029347787, + "learning_rate": 3.3124264994119955e-06, + "loss": 0.3367, + "step": 2667 + }, + { + "epoch": 2.823280423280423, + "grad_norm": 0.18828704734970103, + "learning_rate": 3.292826342610741e-06, + "loss": 0.3094, + "step": 2668 + }, + { + "epoch": 2.8243386243386244, + "grad_norm": 0.18924625950342489, + "learning_rate": 3.273226185809487e-06, + "loss": 0.3434, + "step": 2669 + }, + { + "epoch": 2.825396825396825, + "grad_norm": 0.18417340787076836, + "learning_rate": 3.2536260290082327e-06, + "loss": 0.3303, + "step": 2670 + }, + { + "epoch": 2.8264550264550263, + "grad_norm": 0.20379631500237214, + "learning_rate": 3.2340258722069777e-06, + "loss": 0.3635, + "step": 2671 + }, + { + "epoch": 2.8275132275132275, + "grad_norm": 0.19838887354654922, + "learning_rate": 3.2144257154057236e-06, + "loss": 0.3639, + "step": 2672 + }, + { + "epoch": 2.8285714285714287, + "grad_norm": 0.19794810913649097, + "learning_rate": 3.194825558604469e-06, + "loss": 0.3438, + "step": 2673 + }, + { + "epoch": 2.8296296296296295, + "grad_norm": 0.185510630825901, + "learning_rate": 3.175225401803215e-06, + "loss": 0.3467, + "step": 2674 + }, + { + "epoch": 2.8306878306878307, + "grad_norm": 0.192065774599022, + "learning_rate": 3.15562524500196e-06, + "loss": 0.3375, + "step": 2675 + }, + { + "epoch": 2.831746031746032, + "grad_norm": 0.19435756312714894, + "learning_rate": 3.136025088200706e-06, + "loss": 0.3493, + "step": 2676 + }, + { + "epoch": 2.8328042328042327, + "grad_norm": 0.2009853526245991, + "learning_rate": 3.1164249313994513e-06, + "loss": 0.3707, + "step": 2677 + }, + { + "epoch": 2.833862433862434, + "grad_norm": 0.1744316971987279, + "learning_rate": 3.096824774598197e-06, + "loss": 0.2686, + "step": 2678 + }, + { + "epoch": 2.834920634920635, + "grad_norm": 0.19758049236544187, + "learning_rate": 3.0772246177969426e-06, + "loss": 0.3461, + "step": 2679 + }, + { + "epoch": 2.835978835978836, + "grad_norm": 0.19599350952580494, + "learning_rate": 3.057624460995688e-06, + "loss": 0.3551, + "step": 2680 + }, + { + "epoch": 2.837037037037037, + "grad_norm": 0.18930967657748698, + "learning_rate": 3.0380243041944335e-06, + "loss": 0.3696, + "step": 2681 + }, + { + "epoch": 2.8380952380952382, + "grad_norm": 0.20202839534587627, + "learning_rate": 3.0184241473931794e-06, + "loss": 0.3899, + "step": 2682 + }, + { + "epoch": 2.8391534391534394, + "grad_norm": 0.1889193752419766, + "learning_rate": 2.998823990591925e-06, + "loss": 0.3159, + "step": 2683 + }, + { + "epoch": 2.84021164021164, + "grad_norm": 0.1911231246544652, + "learning_rate": 2.9792238337906707e-06, + "loss": 0.3272, + "step": 2684 + }, + { + "epoch": 2.8412698412698414, + "grad_norm": 0.20056244525036598, + "learning_rate": 2.959623676989416e-06, + "loss": 0.3516, + "step": 2685 + }, + { + "epoch": 2.8423280423280426, + "grad_norm": 0.19474804822394667, + "learning_rate": 2.9400235201881616e-06, + "loss": 0.3619, + "step": 2686 + }, + { + "epoch": 2.8433862433862434, + "grad_norm": 0.1922225375564539, + "learning_rate": 2.9204233633869075e-06, + "loss": 0.3583, + "step": 2687 + }, + { + "epoch": 2.8444444444444446, + "grad_norm": 0.2114409910111819, + "learning_rate": 2.900823206585653e-06, + "loss": 0.3864, + "step": 2688 + }, + { + "epoch": 2.8455026455026458, + "grad_norm": 0.18768776804906878, + "learning_rate": 2.8812230497843984e-06, + "loss": 0.3316, + "step": 2689 + }, + { + "epoch": 2.8465608465608465, + "grad_norm": 0.19630860306480566, + "learning_rate": 2.861622892983144e-06, + "loss": 0.3418, + "step": 2690 + }, + { + "epoch": 2.8476190476190477, + "grad_norm": 0.18278700681759866, + "learning_rate": 2.8420227361818897e-06, + "loss": 0.3305, + "step": 2691 + }, + { + "epoch": 2.848677248677249, + "grad_norm": 0.1828410519755495, + "learning_rate": 2.822422579380635e-06, + "loss": 0.3093, + "step": 2692 + }, + { + "epoch": 2.8497354497354497, + "grad_norm": 0.19191081583592245, + "learning_rate": 2.8028224225793806e-06, + "loss": 0.3178, + "step": 2693 + }, + { + "epoch": 2.850793650793651, + "grad_norm": 0.19137029288992774, + "learning_rate": 2.7832222657781265e-06, + "loss": 0.3494, + "step": 2694 + }, + { + "epoch": 2.851851851851852, + "grad_norm": 0.18943110594155188, + "learning_rate": 2.763622108976872e-06, + "loss": 0.346, + "step": 2695 + }, + { + "epoch": 2.852910052910053, + "grad_norm": 0.20066459093560637, + "learning_rate": 2.744021952175618e-06, + "loss": 0.3665, + "step": 2696 + }, + { + "epoch": 2.853968253968254, + "grad_norm": 0.20097973469627337, + "learning_rate": 2.7244217953743632e-06, + "loss": 0.3853, + "step": 2697 + }, + { + "epoch": 2.8550264550264552, + "grad_norm": 0.19008839219881032, + "learning_rate": 2.7048216385731087e-06, + "loss": 0.346, + "step": 2698 + }, + { + "epoch": 2.856084656084656, + "grad_norm": 0.17411703036920584, + "learning_rate": 2.685221481771854e-06, + "loss": 0.3064, + "step": 2699 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.19049701322996757, + "learning_rate": 2.6656213249706e-06, + "loss": 0.3472, + "step": 2700 + }, + { + "epoch": 2.8582010582010584, + "grad_norm": 0.17636675029171775, + "learning_rate": 2.6460211681693455e-06, + "loss": 0.3013, + "step": 2701 + }, + { + "epoch": 2.859259259259259, + "grad_norm": 0.18713094779664108, + "learning_rate": 2.626421011368091e-06, + "loss": 0.332, + "step": 2702 + }, + { + "epoch": 2.8603174603174604, + "grad_norm": 0.1840880400812757, + "learning_rate": 2.6068208545668364e-06, + "loss": 0.3173, + "step": 2703 + }, + { + "epoch": 2.8613756613756616, + "grad_norm": 0.18365151688406114, + "learning_rate": 2.5872206977655822e-06, + "loss": 0.3351, + "step": 2704 + }, + { + "epoch": 2.8624338624338623, + "grad_norm": 0.19835551815444752, + "learning_rate": 2.567620540964328e-06, + "loss": 0.3591, + "step": 2705 + }, + { + "epoch": 2.8634920634920635, + "grad_norm": 0.1821324213120818, + "learning_rate": 2.5480203841630736e-06, + "loss": 0.3202, + "step": 2706 + }, + { + "epoch": 2.8645502645502647, + "grad_norm": 0.18706640843666805, + "learning_rate": 2.528420227361819e-06, + "loss": 0.3202, + "step": 2707 + }, + { + "epoch": 2.8656084656084655, + "grad_norm": 0.1890910955252759, + "learning_rate": 2.5088200705605645e-06, + "loss": 0.3466, + "step": 2708 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 0.19715376592862416, + "learning_rate": 2.4892199137593104e-06, + "loss": 0.3587, + "step": 2709 + }, + { + "epoch": 2.867724867724868, + "grad_norm": 0.1929758625937477, + "learning_rate": 2.469619756958056e-06, + "loss": 0.3364, + "step": 2710 + }, + { + "epoch": 2.8687830687830687, + "grad_norm": 0.19274028108990232, + "learning_rate": 2.4500196001568013e-06, + "loss": 0.3349, + "step": 2711 + }, + { + "epoch": 2.86984126984127, + "grad_norm": 0.18700906272661122, + "learning_rate": 2.4304194433555467e-06, + "loss": 0.3221, + "step": 2712 + }, + { + "epoch": 2.870899470899471, + "grad_norm": 0.1815493059168304, + "learning_rate": 2.4108192865542926e-06, + "loss": 0.3283, + "step": 2713 + }, + { + "epoch": 2.871957671957672, + "grad_norm": 0.19775169552678987, + "learning_rate": 2.3912191297530385e-06, + "loss": 0.3233, + "step": 2714 + }, + { + "epoch": 2.873015873015873, + "grad_norm": 0.2659234152993213, + "learning_rate": 2.371618972951784e-06, + "loss": 0.3772, + "step": 2715 + }, + { + "epoch": 2.8740740740740742, + "grad_norm": 0.2176181455659064, + "learning_rate": 2.3520188161505294e-06, + "loss": 0.3911, + "step": 2716 + }, + { + "epoch": 2.875132275132275, + "grad_norm": 0.17472137810214414, + "learning_rate": 2.332418659349275e-06, + "loss": 0.2846, + "step": 2717 + }, + { + "epoch": 2.876190476190476, + "grad_norm": 0.19723914254783917, + "learning_rate": 2.3128185025480207e-06, + "loss": 0.3504, + "step": 2718 + }, + { + "epoch": 2.8772486772486774, + "grad_norm": 0.18663011406982238, + "learning_rate": 2.293218345746766e-06, + "loss": 0.3262, + "step": 2719 + }, + { + "epoch": 2.878306878306878, + "grad_norm": 0.19137413050629865, + "learning_rate": 2.2736181889455116e-06, + "loss": 0.3648, + "step": 2720 + }, + { + "epoch": 2.8793650793650793, + "grad_norm": 0.19471470387541595, + "learning_rate": 2.254018032144257e-06, + "loss": 0.3447, + "step": 2721 + }, + { + "epoch": 2.8804232804232806, + "grad_norm": 0.18568099963810475, + "learning_rate": 2.234417875343003e-06, + "loss": 0.3104, + "step": 2722 + }, + { + "epoch": 2.8814814814814813, + "grad_norm": 0.17907029988352577, + "learning_rate": 2.2148177185417488e-06, + "loss": 0.3088, + "step": 2723 + }, + { + "epoch": 2.8825396825396825, + "grad_norm": 0.18879588257488686, + "learning_rate": 2.1952175617404942e-06, + "loss": 0.362, + "step": 2724 + }, + { + "epoch": 2.8835978835978837, + "grad_norm": 0.17652030997033405, + "learning_rate": 2.1756174049392397e-06, + "loss": 0.3012, + "step": 2725 + }, + { + "epoch": 2.8846560846560845, + "grad_norm": 0.1763918711495294, + "learning_rate": 2.156017248137985e-06, + "loss": 0.2995, + "step": 2726 + }, + { + "epoch": 2.8857142857142857, + "grad_norm": 0.2041375473388221, + "learning_rate": 2.136417091336731e-06, + "loss": 0.3903, + "step": 2727 + }, + { + "epoch": 2.886772486772487, + "grad_norm": 0.1799297516998756, + "learning_rate": 2.1168169345354765e-06, + "loss": 0.3152, + "step": 2728 + }, + { + "epoch": 2.8878306878306876, + "grad_norm": 0.1995448891297022, + "learning_rate": 2.097216777734222e-06, + "loss": 0.3903, + "step": 2729 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.19215679672923563, + "learning_rate": 2.0776166209329674e-06, + "loss": 0.3628, + "step": 2730 + }, + { + "epoch": 2.88994708994709, + "grad_norm": 0.18610530187878963, + "learning_rate": 2.0580164641317132e-06, + "loss": 0.3251, + "step": 2731 + }, + { + "epoch": 2.891005291005291, + "grad_norm": 0.17806080573632393, + "learning_rate": 2.0384163073304587e-06, + "loss": 0.3056, + "step": 2732 + }, + { + "epoch": 2.892063492063492, + "grad_norm": 0.18795661792601714, + "learning_rate": 2.0188161505292046e-06, + "loss": 0.3396, + "step": 2733 + }, + { + "epoch": 2.893121693121693, + "grad_norm": 0.18063664445735603, + "learning_rate": 1.99921599372795e-06, + "loss": 0.2997, + "step": 2734 + }, + { + "epoch": 2.894179894179894, + "grad_norm": 0.18511841091215842, + "learning_rate": 1.9796158369266955e-06, + "loss": 0.3388, + "step": 2735 + }, + { + "epoch": 2.895238095238095, + "grad_norm": 0.2003527156439393, + "learning_rate": 1.9600156801254413e-06, + "loss": 0.3554, + "step": 2736 + }, + { + "epoch": 2.8962962962962964, + "grad_norm": 0.19842421312034014, + "learning_rate": 1.940415523324187e-06, + "loss": 0.3761, + "step": 2737 + }, + { + "epoch": 2.897354497354497, + "grad_norm": 0.2742847809137376, + "learning_rate": 1.9208153665229322e-06, + "loss": 0.374, + "step": 2738 + }, + { + "epoch": 2.8984126984126983, + "grad_norm": 0.19759325790955387, + "learning_rate": 1.901215209721678e-06, + "loss": 0.3458, + "step": 2739 + }, + { + "epoch": 2.8994708994708995, + "grad_norm": 0.1949299864756493, + "learning_rate": 1.8816150529204234e-06, + "loss": 0.3851, + "step": 2740 + }, + { + "epoch": 2.9005291005291003, + "grad_norm": 0.18716277246541932, + "learning_rate": 1.862014896119169e-06, + "loss": 0.3562, + "step": 2741 + }, + { + "epoch": 2.9015873015873015, + "grad_norm": 0.18868051576949735, + "learning_rate": 1.8424147393179145e-06, + "loss": 0.3559, + "step": 2742 + }, + { + "epoch": 2.9026455026455027, + "grad_norm": 0.1992670631408877, + "learning_rate": 1.8228145825166603e-06, + "loss": 0.3953, + "step": 2743 + }, + { + "epoch": 2.9037037037037035, + "grad_norm": 0.190011431296961, + "learning_rate": 1.803214425715406e-06, + "loss": 0.3517, + "step": 2744 + }, + { + "epoch": 2.9047619047619047, + "grad_norm": 0.1898685284634927, + "learning_rate": 1.7836142689141515e-06, + "loss": 0.324, + "step": 2745 + }, + { + "epoch": 2.905820105820106, + "grad_norm": 0.20798301264228833, + "learning_rate": 1.7640141121128971e-06, + "loss": 0.4022, + "step": 2746 + }, + { + "epoch": 2.9068783068783066, + "grad_norm": 0.1884618131179482, + "learning_rate": 1.7444139553116426e-06, + "loss": 0.3162, + "step": 2747 + }, + { + "epoch": 2.907936507936508, + "grad_norm": 0.2062849190831743, + "learning_rate": 1.7248137985103882e-06, + "loss": 0.3713, + "step": 2748 + }, + { + "epoch": 2.908994708994709, + "grad_norm": 0.21572426764959662, + "learning_rate": 1.7052136417091337e-06, + "loss": 0.3902, + "step": 2749 + }, + { + "epoch": 2.91005291005291, + "grad_norm": 0.29553314531980623, + "learning_rate": 1.6856134849078794e-06, + "loss": 0.3407, + "step": 2750 + }, + { + "epoch": 2.911111111111111, + "grad_norm": 0.20393510044120217, + "learning_rate": 1.6660133281066248e-06, + "loss": 0.4114, + "step": 2751 + }, + { + "epoch": 2.912169312169312, + "grad_norm": 0.16933792741507034, + "learning_rate": 1.6464131713053705e-06, + "loss": 0.3066, + "step": 2752 + }, + { + "epoch": 2.9132275132275134, + "grad_norm": 0.1996783103654981, + "learning_rate": 1.6268130145041163e-06, + "loss": 0.3649, + "step": 2753 + }, + { + "epoch": 2.914285714285714, + "grad_norm": 0.1967458515577118, + "learning_rate": 1.6072128577028618e-06, + "loss": 0.3568, + "step": 2754 + }, + { + "epoch": 2.9153439153439153, + "grad_norm": 0.18448247900585343, + "learning_rate": 1.5876127009016075e-06, + "loss": 0.326, + "step": 2755 + }, + { + "epoch": 2.9164021164021166, + "grad_norm": 0.19461118793102933, + "learning_rate": 1.568012544100353e-06, + "loss": 0.3213, + "step": 2756 + }, + { + "epoch": 2.9174603174603173, + "grad_norm": 0.1899035298993076, + "learning_rate": 1.5484123872990986e-06, + "loss": 0.3387, + "step": 2757 + }, + { + "epoch": 2.9185185185185185, + "grad_norm": 0.1973718574120133, + "learning_rate": 1.528812230497844e-06, + "loss": 0.3617, + "step": 2758 + }, + { + "epoch": 2.9195767195767197, + "grad_norm": 0.29546988416171227, + "learning_rate": 1.5092120736965897e-06, + "loss": 0.338, + "step": 2759 + }, + { + "epoch": 2.9206349206349205, + "grad_norm": 0.18641514359807654, + "learning_rate": 1.4896119168953353e-06, + "loss": 0.3353, + "step": 2760 + }, + { + "epoch": 2.9216931216931217, + "grad_norm": 0.1986244733946501, + "learning_rate": 1.4700117600940808e-06, + "loss": 0.3652, + "step": 2761 + }, + { + "epoch": 2.922751322751323, + "grad_norm": 0.1840275855494628, + "learning_rate": 1.4504116032928265e-06, + "loss": 0.34, + "step": 2762 + }, + { + "epoch": 2.923809523809524, + "grad_norm": 0.1955588620037554, + "learning_rate": 1.430811446491572e-06, + "loss": 0.3592, + "step": 2763 + }, + { + "epoch": 2.924867724867725, + "grad_norm": 0.19691689936476783, + "learning_rate": 1.4112112896903176e-06, + "loss": 0.377, + "step": 2764 + }, + { + "epoch": 2.925925925925926, + "grad_norm": 0.18009646768548268, + "learning_rate": 1.3916111328890632e-06, + "loss": 0.3205, + "step": 2765 + }, + { + "epoch": 2.9269841269841272, + "grad_norm": 0.1898019017223925, + "learning_rate": 1.372010976087809e-06, + "loss": 0.3549, + "step": 2766 + }, + { + "epoch": 2.928042328042328, + "grad_norm": 0.19704920070535306, + "learning_rate": 1.3524108192865543e-06, + "loss": 0.3775, + "step": 2767 + }, + { + "epoch": 2.929100529100529, + "grad_norm": 0.18705440894929945, + "learning_rate": 1.3328106624853e-06, + "loss": 0.3265, + "step": 2768 + }, + { + "epoch": 2.9301587301587304, + "grad_norm": 0.16473519945175205, + "learning_rate": 1.3132105056840455e-06, + "loss": 0.2888, + "step": 2769 + }, + { + "epoch": 2.931216931216931, + "grad_norm": 0.19655934519795515, + "learning_rate": 1.2936103488827911e-06, + "loss": 0.3502, + "step": 2770 + }, + { + "epoch": 2.9322751322751324, + "grad_norm": 0.19677250020354922, + "learning_rate": 1.2740101920815368e-06, + "loss": 0.377, + "step": 2771 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 0.1937023595690031, + "learning_rate": 1.2544100352802822e-06, + "loss": 0.3376, + "step": 2772 + }, + { + "epoch": 2.9343915343915343, + "grad_norm": 0.1884504777247503, + "learning_rate": 1.234809878479028e-06, + "loss": 0.3534, + "step": 2773 + }, + { + "epoch": 2.9354497354497355, + "grad_norm": 0.1807776339927901, + "learning_rate": 1.2152097216777734e-06, + "loss": 0.3001, + "step": 2774 + }, + { + "epoch": 2.9365079365079367, + "grad_norm": 0.19494893044097508, + "learning_rate": 1.1956095648765192e-06, + "loss": 0.3601, + "step": 2775 + }, + { + "epoch": 2.9375661375661375, + "grad_norm": 0.18173859837169326, + "learning_rate": 1.1760094080752647e-06, + "loss": 0.3351, + "step": 2776 + }, + { + "epoch": 2.9386243386243387, + "grad_norm": 0.19155272417454372, + "learning_rate": 1.1564092512740103e-06, + "loss": 0.3246, + "step": 2777 + }, + { + "epoch": 2.93968253968254, + "grad_norm": 0.18097278381295773, + "learning_rate": 1.1368090944727558e-06, + "loss": 0.3325, + "step": 2778 + }, + { + "epoch": 2.9407407407407407, + "grad_norm": 0.18795150930344018, + "learning_rate": 1.1172089376715015e-06, + "loss": 0.3289, + "step": 2779 + }, + { + "epoch": 2.941798941798942, + "grad_norm": 0.19915958751004778, + "learning_rate": 1.0976087808702471e-06, + "loss": 0.3626, + "step": 2780 + }, + { + "epoch": 2.942857142857143, + "grad_norm": 0.21232105482687197, + "learning_rate": 1.0780086240689926e-06, + "loss": 0.4117, + "step": 2781 + }, + { + "epoch": 2.943915343915344, + "grad_norm": 0.1864031377940111, + "learning_rate": 1.0584084672677382e-06, + "loss": 0.3617, + "step": 2782 + }, + { + "epoch": 2.944973544973545, + "grad_norm": 0.20266505522326464, + "learning_rate": 1.0388083104664837e-06, + "loss": 0.4006, + "step": 2783 + }, + { + "epoch": 2.9460317460317462, + "grad_norm": 0.17805463329627502, + "learning_rate": 1.0192081536652293e-06, + "loss": 0.3185, + "step": 2784 + }, + { + "epoch": 2.947089947089947, + "grad_norm": 0.1865921965755665, + "learning_rate": 9.99607996863975e-07, + "loss": 0.3616, + "step": 2785 + }, + { + "epoch": 2.948148148148148, + "grad_norm": 0.19565848130960645, + "learning_rate": 9.800078400627207e-07, + "loss": 0.3565, + "step": 2786 + }, + { + "epoch": 2.9492063492063494, + "grad_norm": 0.19030143596622032, + "learning_rate": 9.604076832614661e-07, + "loss": 0.3379, + "step": 2787 + }, + { + "epoch": 2.95026455026455, + "grad_norm": 0.2078056379962277, + "learning_rate": 9.408075264602117e-07, + "loss": 0.3939, + "step": 2788 + }, + { + "epoch": 2.9513227513227513, + "grad_norm": 0.18813468584278845, + "learning_rate": 9.212073696589572e-07, + "loss": 0.3346, + "step": 2789 + }, + { + "epoch": 2.9523809523809526, + "grad_norm": 0.17678708921482536, + "learning_rate": 9.01607212857703e-07, + "loss": 0.3052, + "step": 2790 + }, + { + "epoch": 2.9534391534391533, + "grad_norm": 0.18575208703680052, + "learning_rate": 8.820070560564486e-07, + "loss": 0.323, + "step": 2791 + }, + { + "epoch": 2.9544973544973545, + "grad_norm": 0.18700053280637205, + "learning_rate": 8.624068992551941e-07, + "loss": 0.3312, + "step": 2792 + }, + { + "epoch": 2.9555555555555557, + "grad_norm": 0.1947341580545555, + "learning_rate": 8.428067424539397e-07, + "loss": 0.3652, + "step": 2793 + }, + { + "epoch": 2.9566137566137565, + "grad_norm": 0.18336157333209027, + "learning_rate": 8.232065856526852e-07, + "loss": 0.299, + "step": 2794 + }, + { + "epoch": 2.9576719576719577, + "grad_norm": 0.1989438129617965, + "learning_rate": 8.036064288514309e-07, + "loss": 0.3758, + "step": 2795 + }, + { + "epoch": 2.958730158730159, + "grad_norm": 0.1795010261425146, + "learning_rate": 7.840062720501765e-07, + "loss": 0.3095, + "step": 2796 + }, + { + "epoch": 2.9597883597883596, + "grad_norm": 0.1878025427152609, + "learning_rate": 7.64406115248922e-07, + "loss": 0.343, + "step": 2797 + }, + { + "epoch": 2.960846560846561, + "grad_norm": 0.1860910101745494, + "learning_rate": 7.448059584476677e-07, + "loss": 0.3434, + "step": 2798 + }, + { + "epoch": 2.961904761904762, + "grad_norm": 0.2080637929191924, + "learning_rate": 7.252058016464132e-07, + "loss": 0.3734, + "step": 2799 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 0.19340393517672708, + "learning_rate": 7.056056448451588e-07, + "loss": 0.3631, + "step": 2800 + }, + { + "epoch": 2.964021164021164, + "grad_norm": 0.20271654196591155, + "learning_rate": 6.860054880439044e-07, + "loss": 0.3958, + "step": 2801 + }, + { + "epoch": 2.965079365079365, + "grad_norm": 0.19696852611219293, + "learning_rate": 6.6640533124265e-07, + "loss": 0.3719, + "step": 2802 + }, + { + "epoch": 2.966137566137566, + "grad_norm": 0.1884693790216164, + "learning_rate": 6.468051744413956e-07, + "loss": 0.3243, + "step": 2803 + }, + { + "epoch": 2.967195767195767, + "grad_norm": 0.1875853128029582, + "learning_rate": 6.272050176401411e-07, + "loss": 0.3405, + "step": 2804 + }, + { + "epoch": 2.9682539682539684, + "grad_norm": 0.19330130887895539, + "learning_rate": 6.076048608388867e-07, + "loss": 0.3742, + "step": 2805 + }, + { + "epoch": 2.969312169312169, + "grad_norm": 0.2007548359643975, + "learning_rate": 5.880047040376323e-07, + "loss": 0.3831, + "step": 2806 + }, + { + "epoch": 2.9703703703703703, + "grad_norm": 0.18320658631267467, + "learning_rate": 5.684045472363779e-07, + "loss": 0.3455, + "step": 2807 + }, + { + "epoch": 2.9714285714285715, + "grad_norm": 0.19352908453997061, + "learning_rate": 5.488043904351236e-07, + "loss": 0.3337, + "step": 2808 + }, + { + "epoch": 2.9724867724867723, + "grad_norm": 0.18917045289367823, + "learning_rate": 5.292042336338691e-07, + "loss": 0.3668, + "step": 2809 + }, + { + "epoch": 2.9735449735449735, + "grad_norm": 0.28331421422089914, + "learning_rate": 5.096040768326147e-07, + "loss": 0.3877, + "step": 2810 + }, + { + "epoch": 2.9746031746031747, + "grad_norm": 0.18730859428046542, + "learning_rate": 4.900039200313603e-07, + "loss": 0.3235, + "step": 2811 + }, + { + "epoch": 2.9756613756613755, + "grad_norm": 0.2061919680406255, + "learning_rate": 4.7040376323010584e-07, + "loss": 0.3877, + "step": 2812 + }, + { + "epoch": 2.9767195767195767, + "grad_norm": 0.17991475264599735, + "learning_rate": 4.508036064288515e-07, + "loss": 0.3117, + "step": 2813 + }, + { + "epoch": 2.977777777777778, + "grad_norm": 0.18979406930672754, + "learning_rate": 4.3120344962759706e-07, + "loss": 0.3523, + "step": 2814 + }, + { + "epoch": 2.9788359788359786, + "grad_norm": 0.20105316878907992, + "learning_rate": 4.116032928263426e-07, + "loss": 0.3547, + "step": 2815 + }, + { + "epoch": 2.97989417989418, + "grad_norm": 0.18428131124919112, + "learning_rate": 3.920031360250882e-07, + "loss": 0.323, + "step": 2816 + }, + { + "epoch": 2.980952380952381, + "grad_norm": 0.1812934812970745, + "learning_rate": 3.7240297922383384e-07, + "loss": 0.3181, + "step": 2817 + }, + { + "epoch": 2.982010582010582, + "grad_norm": 0.1863114001092859, + "learning_rate": 3.528028224225794e-07, + "loss": 0.3177, + "step": 2818 + }, + { + "epoch": 2.983068783068783, + "grad_norm": 0.18605121181705242, + "learning_rate": 3.33202665621325e-07, + "loss": 0.336, + "step": 2819 + }, + { + "epoch": 2.984126984126984, + "grad_norm": 0.20781903726047113, + "learning_rate": 3.1360250882007056e-07, + "loss": 0.4122, + "step": 2820 + }, + { + "epoch": 2.985185185185185, + "grad_norm": 0.2016418889379911, + "learning_rate": 2.9400235201881617e-07, + "loss": 0.3835, + "step": 2821 + }, + { + "epoch": 2.986243386243386, + "grad_norm": 0.17435892542662546, + "learning_rate": 2.744021952175618e-07, + "loss": 0.302, + "step": 2822 + }, + { + "epoch": 2.9873015873015873, + "grad_norm": 0.18917626577977661, + "learning_rate": 2.5480203841630734e-07, + "loss": 0.3654, + "step": 2823 + }, + { + "epoch": 2.988359788359788, + "grad_norm": 0.18622957441576268, + "learning_rate": 2.3520188161505292e-07, + "loss": 0.3388, + "step": 2824 + }, + { + "epoch": 2.9894179894179893, + "grad_norm": 0.1859174632852059, + "learning_rate": 2.1560172481379853e-07, + "loss": 0.3393, + "step": 2825 + }, + { + "epoch": 2.9904761904761905, + "grad_norm": 0.19065097631350006, + "learning_rate": 1.960015680125441e-07, + "loss": 0.3422, + "step": 2826 + }, + { + "epoch": 2.9915343915343913, + "grad_norm": 0.18272454868166982, + "learning_rate": 1.764014112112897e-07, + "loss": 0.3127, + "step": 2827 + }, + { + "epoch": 2.9925925925925925, + "grad_norm": 0.18725014042815322, + "learning_rate": 1.5680125441003528e-07, + "loss": 0.3275, + "step": 2828 + }, + { + "epoch": 2.9936507936507937, + "grad_norm": 0.17561038959172634, + "learning_rate": 1.372010976087809e-07, + "loss": 0.3066, + "step": 2829 + }, + { + "epoch": 2.9947089947089944, + "grad_norm": 0.18415272195204574, + "learning_rate": 1.1760094080752646e-07, + "loss": 0.3328, + "step": 2830 + }, + { + "epoch": 2.9957671957671956, + "grad_norm": 0.17845429273021535, + "learning_rate": 9.800078400627206e-08, + "loss": 0.2933, + "step": 2831 + }, + { + "epoch": 2.996825396825397, + "grad_norm": 0.1784414113949217, + "learning_rate": 7.840062720501764e-08, + "loss": 0.3129, + "step": 2832 + }, + { + "epoch": 2.997883597883598, + "grad_norm": 0.18326094077219415, + "learning_rate": 5.880047040376323e-08, + "loss": 0.339, + "step": 2833 + }, + { + "epoch": 2.998941798941799, + "grad_norm": 0.181202143145444, + "learning_rate": 3.920031360250882e-08, + "loss": 0.2975, + "step": 2834 + }, + { + "epoch": 3.0, + "grad_norm": 1.1791412063210063, + "learning_rate": 1.960015680125441e-08, + "loss": 0.3422, + "step": 2835 + }, + { + "epoch": 3.0, + "step": 2835, + "total_flos": 3.152810456905679e+19, + "train_loss": 0.5287227819941451, + "train_runtime": 91700.2784, + "train_samples_per_second": 0.495, + "train_steps_per_second": 0.031 + } + ], + "logging_steps": 1, + "max_steps": 2835, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.152810456905679e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}