| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 4.0, |
| "eval_steps": 500, |
| "global_step": 536, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.007511737089201878, |
| "grad_norm": 1.5546875, |
| "learning_rate": 5e-05, |
| "loss": 0.7059, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.015023474178403756, |
| "grad_norm": 2.046875, |
| "learning_rate": 4.996268656716418e-05, |
| "loss": 0.8614, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.022535211267605635, |
| "grad_norm": 14.375, |
| "learning_rate": 4.992537313432836e-05, |
| "loss": 0.799, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.03004694835680751, |
| "grad_norm": 2.421875, |
| "learning_rate": 4.988805970149254e-05, |
| "loss": 0.7599, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.03755868544600939, |
| "grad_norm": 1.7890625, |
| "learning_rate": 4.985074626865672e-05, |
| "loss": 0.7254, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.04507042253521127, |
| "grad_norm": 1.4375, |
| "learning_rate": 4.98134328358209e-05, |
| "loss": 0.7982, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.05258215962441314, |
| "grad_norm": 1.546875, |
| "learning_rate": 4.977611940298508e-05, |
| "loss": 0.7192, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.06009389671361502, |
| "grad_norm": 2.71875, |
| "learning_rate": 4.973880597014925e-05, |
| "loss": 0.8359, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.0676056338028169, |
| "grad_norm": 1.4140625, |
| "learning_rate": 4.9701492537313436e-05, |
| "loss": 0.6135, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.07511737089201878, |
| "grad_norm": 3.96875, |
| "learning_rate": 4.966417910447762e-05, |
| "loss": 1.0456, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.08262910798122065, |
| "grad_norm": 1.6171875, |
| "learning_rate": 4.9626865671641794e-05, |
| "loss": 0.6937, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.09014084507042254, |
| "grad_norm": 1.3828125, |
| "learning_rate": 4.9589552238805977e-05, |
| "loss": 0.6935, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.09765258215962441, |
| "grad_norm": 1.7265625, |
| "learning_rate": 4.955223880597015e-05, |
| "loss": 0.9582, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.10516431924882629, |
| "grad_norm": 1.6015625, |
| "learning_rate": 4.951492537313433e-05, |
| "loss": 0.6369, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.11267605633802817, |
| "grad_norm": 1.4453125, |
| "learning_rate": 4.9477611940298504e-05, |
| "loss": 0.8523, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.12018779342723004, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.944029850746269e-05, |
| "loss": 0.7572, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.12769953051643193, |
| "grad_norm": 1.6171875, |
| "learning_rate": 4.940298507462687e-05, |
| "loss": 0.8729, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.1352112676056338, |
| "grad_norm": 1.4296875, |
| "learning_rate": 4.9365671641791045e-05, |
| "loss": 0.7657, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.14272300469483568, |
| "grad_norm": 1.5546875, |
| "learning_rate": 4.932835820895523e-05, |
| "loss": 0.7701, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.15023474178403756, |
| "grad_norm": 1.421875, |
| "learning_rate": 4.92910447761194e-05, |
| "loss": 0.8063, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.15774647887323945, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.9253731343283586e-05, |
| "loss": 0.6274, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.1652582159624413, |
| "grad_norm": 1.0234375, |
| "learning_rate": 4.921641791044777e-05, |
| "loss": 0.6862, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.1727699530516432, |
| "grad_norm": 1.3203125, |
| "learning_rate": 4.9179104477611944e-05, |
| "loss": 0.8265, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.18028169014084508, |
| "grad_norm": 1.3671875, |
| "learning_rate": 4.914179104477612e-05, |
| "loss": 0.6659, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.18779342723004694, |
| "grad_norm": 1.34375, |
| "learning_rate": 4.91044776119403e-05, |
| "loss": 0.6795, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.19530516431924883, |
| "grad_norm": 3.734375, |
| "learning_rate": 4.906716417910448e-05, |
| "loss": 0.7914, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.2028169014084507, |
| "grad_norm": 1.578125, |
| "learning_rate": 4.902985074626866e-05, |
| "loss": 0.7915, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.21032863849765257, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.899253731343284e-05, |
| "loss": 0.6433, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.21784037558685446, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.895522388059702e-05, |
| "loss": 0.8444, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.22535211267605634, |
| "grad_norm": 1.328125, |
| "learning_rate": 4.8917910447761195e-05, |
| "loss": 0.7353, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.23286384976525823, |
| "grad_norm": 1.0859375, |
| "learning_rate": 4.888059701492538e-05, |
| "loss": 0.6133, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.2403755868544601, |
| "grad_norm": 1.4375, |
| "learning_rate": 4.884328358208955e-05, |
| "loss": 0.671, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.24788732394366197, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.8805970149253735e-05, |
| "loss": 0.7712, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.25539906103286386, |
| "grad_norm": 1.4765625, |
| "learning_rate": 4.876865671641792e-05, |
| "loss": 0.8207, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.26291079812206575, |
| "grad_norm": 1.28125, |
| "learning_rate": 4.8731343283582094e-05, |
| "loss": 0.7435, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.2704225352112676, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.869402985074627e-05, |
| "loss": 0.7014, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.27793427230046946, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.8656716417910445e-05, |
| "loss": 0.7946, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.28544600938967135, |
| "grad_norm": 1.6328125, |
| "learning_rate": 4.861940298507463e-05, |
| "loss": 0.5345, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.29295774647887324, |
| "grad_norm": 2.859375, |
| "learning_rate": 4.858208955223881e-05, |
| "loss": 0.8832, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.3004694835680751, |
| "grad_norm": 2.09375, |
| "learning_rate": 4.8544776119402986e-05, |
| "loss": 0.8591, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.307981220657277, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.850746268656717e-05, |
| "loss": 0.6557, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.3154929577464789, |
| "grad_norm": 0.984375, |
| "learning_rate": 4.8470149253731344e-05, |
| "loss": 0.715, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.32300469483568073, |
| "grad_norm": 0.9453125, |
| "learning_rate": 4.843283582089552e-05, |
| "loss": 0.6608, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.3305164319248826, |
| "grad_norm": 1.5703125, |
| "learning_rate": 4.83955223880597e-05, |
| "loss": 0.5904, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.3380281690140845, |
| "grad_norm": 2.703125, |
| "learning_rate": 4.8358208955223885e-05, |
| "loss": 0.8645, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.3455399061032864, |
| "grad_norm": 5.84375, |
| "learning_rate": 4.832089552238806e-05, |
| "loss": 0.9412, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.3530516431924883, |
| "grad_norm": 1.2734375, |
| "learning_rate": 4.8283582089552244e-05, |
| "loss": 0.8767, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.36056338028169016, |
| "grad_norm": 1.296875, |
| "learning_rate": 4.824626865671642e-05, |
| "loss": 0.8197, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.36807511737089205, |
| "grad_norm": 1.0625, |
| "learning_rate": 4.8208955223880595e-05, |
| "loss": 0.6598, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.3755868544600939, |
| "grad_norm": 4.0, |
| "learning_rate": 4.817164179104478e-05, |
| "loss": 1.1535, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.38309859154929576, |
| "grad_norm": 1.09375, |
| "learning_rate": 4.813432835820896e-05, |
| "loss": 0.6635, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.39061032863849765, |
| "grad_norm": 1.3125, |
| "learning_rate": 4.8097014925373136e-05, |
| "loss": 0.6979, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.39812206572769954, |
| "grad_norm": 1.2734375, |
| "learning_rate": 4.805970149253732e-05, |
| "loss": 0.7167, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.4056338028169014, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.8022388059701494e-05, |
| "loss": 0.8033, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.4131455399061033, |
| "grad_norm": 1.421875, |
| "learning_rate": 4.798507462686567e-05, |
| "loss": 0.8736, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.42065727699530514, |
| "grad_norm": 1.2890625, |
| "learning_rate": 4.794776119402985e-05, |
| "loss": 0.8361, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.428169014084507, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.7910447761194035e-05, |
| "loss": 0.6386, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.4356807511737089, |
| "grad_norm": 1.2421875, |
| "learning_rate": 4.787313432835821e-05, |
| "loss": 0.7318, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.4431924882629108, |
| "grad_norm": 1.4375, |
| "learning_rate": 4.7835820895522394e-05, |
| "loss": 0.7141, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.4507042253521127, |
| "grad_norm": 1.0859375, |
| "learning_rate": 4.779850746268657e-05, |
| "loss": 0.6612, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.4582159624413146, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.7761194029850745e-05, |
| "loss": 0.8155, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.46572769953051646, |
| "grad_norm": 1.0546875, |
| "learning_rate": 4.772388059701493e-05, |
| "loss": 0.8123, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.4732394366197183, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.768656716417911e-05, |
| "loss": 0.6706, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.4807511737089202, |
| "grad_norm": 1.25, |
| "learning_rate": 4.7649253731343286e-05, |
| "loss": 0.8376, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.48826291079812206, |
| "grad_norm": 0.98828125, |
| "learning_rate": 4.761194029850746e-05, |
| "loss": 0.7736, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.49577464788732395, |
| "grad_norm": 1.296875, |
| "learning_rate": 4.7574626865671644e-05, |
| "loss": 0.777, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.5032863849765258, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.753731343283582e-05, |
| "loss": 0.7517, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.5107981220657277, |
| "grad_norm": 1.2578125, |
| "learning_rate": 4.75e-05, |
| "loss": 0.8422, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.5183098591549296, |
| "grad_norm": 1.2109375, |
| "learning_rate": 4.7462686567164185e-05, |
| "loss": 0.6369, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.5258215962441315, |
| "grad_norm": 1.0, |
| "learning_rate": 4.742537313432836e-05, |
| "loss": 0.6823, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.5333333333333333, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.738805970149254e-05, |
| "loss": 0.6799, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.5408450704225352, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.735074626865672e-05, |
| "loss": 0.6477, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.5483568075117371, |
| "grad_norm": 1.125, |
| "learning_rate": 4.7313432835820895e-05, |
| "loss": 0.7116, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.5558685446009389, |
| "grad_norm": 1.0390625, |
| "learning_rate": 4.727611940298508e-05, |
| "loss": 0.6501, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.5633802816901409, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.723880597014926e-05, |
| "loss": 0.8061, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.5708920187793427, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.7201492537313436e-05, |
| "loss": 0.7363, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.5784037558685446, |
| "grad_norm": 1.15625, |
| "learning_rate": 4.716417910447761e-05, |
| "loss": 0.7949, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.5859154929577465, |
| "grad_norm": 1.25, |
| "learning_rate": 4.7126865671641794e-05, |
| "loss": 0.7815, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.5934272300469483, |
| "grad_norm": 1.40625, |
| "learning_rate": 4.708955223880597e-05, |
| "loss": 0.7815, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.6009389671361502, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.705223880597015e-05, |
| "loss": 0.7299, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.6084507042253521, |
| "grad_norm": 1.25, |
| "learning_rate": 4.7014925373134335e-05, |
| "loss": 0.5975, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.615962441314554, |
| "grad_norm": 1.8984375, |
| "learning_rate": 4.697761194029851e-05, |
| "loss": 0.8056, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.6234741784037559, |
| "grad_norm": 1.3671875, |
| "learning_rate": 4.6940298507462687e-05, |
| "loss": 0.8034, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.6309859154929578, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.690298507462687e-05, |
| "loss": 0.6472, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.6384976525821596, |
| "grad_norm": 1.328125, |
| "learning_rate": 4.6865671641791045e-05, |
| "loss": 0.8561, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.6460093896713615, |
| "grad_norm": 2.90625, |
| "learning_rate": 4.682835820895523e-05, |
| "loss": 0.8774, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.6535211267605634, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.67910447761194e-05, |
| "loss": 0.6865, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.6610328638497652, |
| "grad_norm": 1.53125, |
| "learning_rate": 4.6753731343283586e-05, |
| "loss": 0.798, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.6685446009389672, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.671641791044776e-05, |
| "loss": 0.7587, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.676056338028169, |
| "grad_norm": 1.25, |
| "learning_rate": 4.667910447761194e-05, |
| "loss": 0.6989, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.6835680751173709, |
| "grad_norm": 1.0546875, |
| "learning_rate": 4.664179104477612e-05, |
| "loss": 0.639, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.6910798122065728, |
| "grad_norm": 1.453125, |
| "learning_rate": 4.66044776119403e-05, |
| "loss": 0.6844, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.6985915492957746, |
| "grad_norm": 1.03125, |
| "learning_rate": 4.656716417910448e-05, |
| "loss": 0.7513, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.7061032863849765, |
| "grad_norm": 1.015625, |
| "learning_rate": 4.652985074626866e-05, |
| "loss": 0.6264, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.7136150234741784, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.6492537313432837e-05, |
| "loss": 0.7228, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.7211267605633803, |
| "grad_norm": 1.0078125, |
| "learning_rate": 4.645522388059701e-05, |
| "loss": 0.5355, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.7286384976525822, |
| "grad_norm": 1.359375, |
| "learning_rate": 4.6417910447761195e-05, |
| "loss": 0.7717, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.7361502347417841, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.638059701492538e-05, |
| "loss": 0.7365, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.7436619718309859, |
| "grad_norm": 3.375, |
| "learning_rate": 4.634328358208955e-05, |
| "loss": 0.8382, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.7511737089201878, |
| "grad_norm": 1.421875, |
| "learning_rate": 4.6305970149253736e-05, |
| "loss": 0.7674, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.7586854460093897, |
| "grad_norm": 1.7265625, |
| "learning_rate": 4.626865671641791e-05, |
| "loss": 0.554, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.7661971830985915, |
| "grad_norm": 1.25, |
| "learning_rate": 4.623134328358209e-05, |
| "loss": 0.7921, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.7737089201877935, |
| "grad_norm": 1.2734375, |
| "learning_rate": 4.6194029850746277e-05, |
| "loss": 0.6282, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.7812206572769953, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.615671641791045e-05, |
| "loss": 0.7992, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.7887323943661971, |
| "grad_norm": 1.0859375, |
| "learning_rate": 4.611940298507463e-05, |
| "loss": 0.7313, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.7962441314553991, |
| "grad_norm": 0.9921875, |
| "learning_rate": 4.608208955223881e-05, |
| "loss": 0.7154, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.8037558685446009, |
| "grad_norm": 1.0859375, |
| "learning_rate": 4.6044776119402986e-05, |
| "loss": 0.6904, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.8112676056338028, |
| "grad_norm": 1.125, |
| "learning_rate": 4.600746268656716e-05, |
| "loss": 0.7075, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.8187793427230047, |
| "grad_norm": 2.609375, |
| "learning_rate": 4.5970149253731345e-05, |
| "loss": 0.7362, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.8262910798122066, |
| "grad_norm": 1.125, |
| "learning_rate": 4.593283582089553e-05, |
| "loss": 0.6568, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.8338028169014085, |
| "grad_norm": 1.3984375, |
| "learning_rate": 4.58955223880597e-05, |
| "loss": 0.7508, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.8413145539906103, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.585820895522388e-05, |
| "loss": 0.8402, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.8488262910798122, |
| "grad_norm": 1.2734375, |
| "learning_rate": 4.582089552238806e-05, |
| "loss": 0.6946, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.856338028169014, |
| "grad_norm": 1.03125, |
| "learning_rate": 4.578358208955224e-05, |
| "loss": 0.6877, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.863849765258216, |
| "grad_norm": 1.2890625, |
| "learning_rate": 4.574626865671642e-05, |
| "loss": 0.6703, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.8713615023474178, |
| "grad_norm": 1.2578125, |
| "learning_rate": 4.57089552238806e-05, |
| "loss": 0.6673, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.8788732394366198, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.567164179104478e-05, |
| "loss": 0.8, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.8863849765258216, |
| "grad_norm": 1.7734375, |
| "learning_rate": 4.5634328358208954e-05, |
| "loss": 0.7146, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.8938967136150234, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.5597014925373136e-05, |
| "loss": 0.7342, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.9014084507042254, |
| "grad_norm": 1.390625, |
| "learning_rate": 4.555970149253732e-05, |
| "loss": 0.865, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.9089201877934272, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.5522388059701495e-05, |
| "loss": 0.8743, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.9164319248826291, |
| "grad_norm": 1.0625, |
| "learning_rate": 4.548507462686568e-05, |
| "loss": 0.8124, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.923943661971831, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.544776119402985e-05, |
| "loss": 0.739, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.9314553990610329, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.541044776119403e-05, |
| "loss": 0.9343, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.9389671361502347, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.537313432835821e-05, |
| "loss": 0.8239, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.9464788732394366, |
| "grad_norm": 1.125, |
| "learning_rate": 4.5335820895522394e-05, |
| "loss": 0.5928, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.9539906103286385, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.529850746268657e-05, |
| "loss": 0.6705, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.9615023474178404, |
| "grad_norm": 1.5390625, |
| "learning_rate": 4.526119402985075e-05, |
| "loss": 0.7937, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.9690140845070423, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.522388059701493e-05, |
| "loss": 0.7819, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.9765258215962441, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.5186567164179104e-05, |
| "loss": 0.7217, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.984037558685446, |
| "grad_norm": 0.97265625, |
| "learning_rate": 4.5149253731343286e-05, |
| "loss": 0.5954, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.9915492957746479, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.511194029850747e-05, |
| "loss": 0.6472, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.9990610328638497, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.5074626865671645e-05, |
| "loss": 0.6715, |
| "step": 133 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 2.765625, |
| "learning_rate": 4.503731343283582e-05, |
| "loss": 0.7686, |
| "step": 134 |
| }, |
| { |
| "epoch": 1.007511737089202, |
| "grad_norm": 1.625, |
| "learning_rate": 4.5e-05, |
| "loss": 0.6494, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.0150234741784037, |
| "grad_norm": 1.46875, |
| "learning_rate": 4.496268656716418e-05, |
| "loss": 0.5652, |
| "step": 136 |
| }, |
| { |
| "epoch": 1.0225352112676056, |
| "grad_norm": 0.9296875, |
| "learning_rate": 4.492537313432836e-05, |
| "loss": 0.5451, |
| "step": 137 |
| }, |
| { |
| "epoch": 1.0300469483568075, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.4888059701492544e-05, |
| "loss": 0.7535, |
| "step": 138 |
| }, |
| { |
| "epoch": 1.0375586854460095, |
| "grad_norm": 1.328125, |
| "learning_rate": 4.485074626865672e-05, |
| "loss": 0.6217, |
| "step": 139 |
| }, |
| { |
| "epoch": 1.0450704225352112, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.4813432835820895e-05, |
| "loss": 0.6696, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.0525821596244131, |
| "grad_norm": 1.59375, |
| "learning_rate": 4.477611940298508e-05, |
| "loss": 0.5071, |
| "step": 141 |
| }, |
| { |
| "epoch": 1.060093896713615, |
| "grad_norm": 1.4921875, |
| "learning_rate": 4.4738805970149254e-05, |
| "loss": 0.5824, |
| "step": 142 |
| }, |
| { |
| "epoch": 1.0676056338028168, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.4701492537313436e-05, |
| "loss": 0.4513, |
| "step": 143 |
| }, |
| { |
| "epoch": 1.0751173708920188, |
| "grad_norm": 1.3671875, |
| "learning_rate": 4.466417910447762e-05, |
| "loss": 0.6003, |
| "step": 144 |
| }, |
| { |
| "epoch": 1.0826291079812207, |
| "grad_norm": 1.3515625, |
| "learning_rate": 4.4626865671641794e-05, |
| "loss": 0.6236, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.0901408450704226, |
| "grad_norm": 1.0859375, |
| "learning_rate": 4.458955223880597e-05, |
| "loss": 0.5397, |
| "step": 146 |
| }, |
| { |
| "epoch": 1.0976525821596244, |
| "grad_norm": 2.0625, |
| "learning_rate": 4.455223880597015e-05, |
| "loss": 0.8009, |
| "step": 147 |
| }, |
| { |
| "epoch": 1.1051643192488263, |
| "grad_norm": 1.3125, |
| "learning_rate": 4.451492537313433e-05, |
| "loss": 0.7748, |
| "step": 148 |
| }, |
| { |
| "epoch": 1.1126760563380282, |
| "grad_norm": 3.421875, |
| "learning_rate": 4.447761194029851e-05, |
| "loss": 0.7256, |
| "step": 149 |
| }, |
| { |
| "epoch": 1.12018779342723, |
| "grad_norm": 1.296875, |
| "learning_rate": 4.4440298507462694e-05, |
| "loss": 0.6101, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.127699530516432, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.440298507462687e-05, |
| "loss": 0.4897, |
| "step": 151 |
| }, |
| { |
| "epoch": 1.1352112676056338, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.4365671641791045e-05, |
| "loss": 0.7608, |
| "step": 152 |
| }, |
| { |
| "epoch": 1.1427230046948358, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.432835820895523e-05, |
| "loss": 0.6389, |
| "step": 153 |
| }, |
| { |
| "epoch": 1.1502347417840375, |
| "grad_norm": 0.98828125, |
| "learning_rate": 4.4291044776119403e-05, |
| "loss": 0.4912, |
| "step": 154 |
| }, |
| { |
| "epoch": 1.1577464788732394, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.4253731343283586e-05, |
| "loss": 0.6865, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.1652582159624414, |
| "grad_norm": 1.0078125, |
| "learning_rate": 4.421641791044777e-05, |
| "loss": 0.4687, |
| "step": 156 |
| }, |
| { |
| "epoch": 1.172769953051643, |
| "grad_norm": 1.6484375, |
| "learning_rate": 4.4179104477611944e-05, |
| "loss": 0.6835, |
| "step": 157 |
| }, |
| { |
| "epoch": 1.180281690140845, |
| "grad_norm": 1.015625, |
| "learning_rate": 4.414179104477612e-05, |
| "loss": 0.4748, |
| "step": 158 |
| }, |
| { |
| "epoch": 1.187793427230047, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.4104477611940296e-05, |
| "loss": 0.556, |
| "step": 159 |
| }, |
| { |
| "epoch": 1.1953051643192487, |
| "grad_norm": 1.0390625, |
| "learning_rate": 4.406716417910448e-05, |
| "loss": 0.4781, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.2028169014084507, |
| "grad_norm": 0.87890625, |
| "learning_rate": 4.402985074626866e-05, |
| "loss": 0.4693, |
| "step": 161 |
| }, |
| { |
| "epoch": 1.2103286384976526, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.399253731343284e-05, |
| "loss": 0.5578, |
| "step": 162 |
| }, |
| { |
| "epoch": 1.2178403755868545, |
| "grad_norm": 1.2890625, |
| "learning_rate": 4.395522388059702e-05, |
| "loss": 0.6094, |
| "step": 163 |
| }, |
| { |
| "epoch": 1.2253521126760563, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.3917910447761195e-05, |
| "loss": 0.6457, |
| "step": 164 |
| }, |
| { |
| "epoch": 1.2328638497652582, |
| "grad_norm": 0.96875, |
| "learning_rate": 4.388059701492537e-05, |
| "loss": 0.5279, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.2403755868544601, |
| "grad_norm": 1.0234375, |
| "learning_rate": 4.384328358208955e-05, |
| "loss": 0.5519, |
| "step": 166 |
| }, |
| { |
| "epoch": 1.247887323943662, |
| "grad_norm": 0.97265625, |
| "learning_rate": 4.3805970149253736e-05, |
| "loss": 0.4328, |
| "step": 167 |
| }, |
| { |
| "epoch": 1.2553990610328638, |
| "grad_norm": 1.6484375, |
| "learning_rate": 4.376865671641791e-05, |
| "loss": 0.6765, |
| "step": 168 |
| }, |
| { |
| "epoch": 1.2629107981220657, |
| "grad_norm": 0.953125, |
| "learning_rate": 4.3731343283582094e-05, |
| "loss": 0.4776, |
| "step": 169 |
| }, |
| { |
| "epoch": 1.2704225352112677, |
| "grad_norm": 1.125, |
| "learning_rate": 4.369402985074627e-05, |
| "loss": 0.4864, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.2779342723004694, |
| "grad_norm": 1.2421875, |
| "learning_rate": 4.3656716417910446e-05, |
| "loss": 0.5458, |
| "step": 171 |
| }, |
| { |
| "epoch": 1.2854460093896714, |
| "grad_norm": 1.046875, |
| "learning_rate": 4.361940298507463e-05, |
| "loss": 0.5282, |
| "step": 172 |
| }, |
| { |
| "epoch": 1.2929577464788733, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.358208955223881e-05, |
| "loss": 0.5868, |
| "step": 173 |
| }, |
| { |
| "epoch": 1.300469483568075, |
| "grad_norm": 0.94921875, |
| "learning_rate": 4.354477611940299e-05, |
| "loss": 0.5913, |
| "step": 174 |
| }, |
| { |
| "epoch": 1.307981220657277, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.350746268656717e-05, |
| "loss": 0.6012, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.315492957746479, |
| "grad_norm": 1.578125, |
| "learning_rate": 4.3470149253731345e-05, |
| "loss": 0.7648, |
| "step": 176 |
| }, |
| { |
| "epoch": 1.3230046948356806, |
| "grad_norm": 1.09375, |
| "learning_rate": 4.343283582089552e-05, |
| "loss": 0.5035, |
| "step": 177 |
| }, |
| { |
| "epoch": 1.3305164319248826, |
| "grad_norm": 0.98046875, |
| "learning_rate": 4.33955223880597e-05, |
| "loss": 0.6096, |
| "step": 178 |
| }, |
| { |
| "epoch": 1.3380281690140845, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.3358208955223886e-05, |
| "loss": 0.6028, |
| "step": 179 |
| }, |
| { |
| "epoch": 1.3455399061032864, |
| "grad_norm": 1.046875, |
| "learning_rate": 4.332089552238806e-05, |
| "loss": 0.4301, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.3530516431924884, |
| "grad_norm": 0.875, |
| "learning_rate": 4.328358208955224e-05, |
| "loss": 0.4578, |
| "step": 181 |
| }, |
| { |
| "epoch": 1.36056338028169, |
| "grad_norm": 1.2734375, |
| "learning_rate": 4.324626865671642e-05, |
| "loss": 0.6293, |
| "step": 182 |
| }, |
| { |
| "epoch": 1.368075117370892, |
| "grad_norm": 1.09375, |
| "learning_rate": 4.3208955223880596e-05, |
| "loss": 0.6577, |
| "step": 183 |
| }, |
| { |
| "epoch": 1.375586854460094, |
| "grad_norm": 1.0, |
| "learning_rate": 4.317164179104478e-05, |
| "loss": 0.5545, |
| "step": 184 |
| }, |
| { |
| "epoch": 1.3830985915492957, |
| "grad_norm": 1.28125, |
| "learning_rate": 4.313432835820896e-05, |
| "loss": 0.6445, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.3906103286384977, |
| "grad_norm": 0.9609375, |
| "learning_rate": 4.3097014925373137e-05, |
| "loss": 0.3414, |
| "step": 186 |
| }, |
| { |
| "epoch": 1.3981220657276996, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.305970149253731e-05, |
| "loss": 0.6948, |
| "step": 187 |
| }, |
| { |
| "epoch": 1.4056338028169013, |
| "grad_norm": 1.15625, |
| "learning_rate": 4.3022388059701495e-05, |
| "loss": 0.4784, |
| "step": 188 |
| }, |
| { |
| "epoch": 1.4131455399061033, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.298507462686567e-05, |
| "loss": 0.665, |
| "step": 189 |
| }, |
| { |
| "epoch": 1.4206572769953052, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.294776119402985e-05, |
| "loss": 0.5881, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.428169014084507, |
| "grad_norm": 0.98046875, |
| "learning_rate": 4.2910447761194036e-05, |
| "loss": 0.4298, |
| "step": 191 |
| }, |
| { |
| "epoch": 1.4356807511737089, |
| "grad_norm": 1.2578125, |
| "learning_rate": 4.287313432835821e-05, |
| "loss": 0.6543, |
| "step": 192 |
| }, |
| { |
| "epoch": 1.4431924882629108, |
| "grad_norm": 1.09375, |
| "learning_rate": 4.283582089552239e-05, |
| "loss": 0.4931, |
| "step": 193 |
| }, |
| { |
| "epoch": 1.4507042253521127, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.279850746268657e-05, |
| "loss": 0.5279, |
| "step": 194 |
| }, |
| { |
| "epoch": 1.4582159624413147, |
| "grad_norm": 1.015625, |
| "learning_rate": 4.2761194029850746e-05, |
| "loss": 0.5347, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.4657276995305164, |
| "grad_norm": 1.15625, |
| "learning_rate": 4.272388059701493e-05, |
| "loss": 0.48, |
| "step": 196 |
| }, |
| { |
| "epoch": 1.4732394366197183, |
| "grad_norm": 1.25, |
| "learning_rate": 4.268656716417911e-05, |
| "loss": 0.6988, |
| "step": 197 |
| }, |
| { |
| "epoch": 1.4807511737089203, |
| "grad_norm": 1.421875, |
| "learning_rate": 4.2649253731343286e-05, |
| "loss": 0.5859, |
| "step": 198 |
| }, |
| { |
| "epoch": 1.488262910798122, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.261194029850746e-05, |
| "loss": 0.5283, |
| "step": 199 |
| }, |
| { |
| "epoch": 1.495774647887324, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.2574626865671645e-05, |
| "loss": 0.4631, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.503286384976526, |
| "grad_norm": 0.9921875, |
| "learning_rate": 4.253731343283582e-05, |
| "loss": 0.5231, |
| "step": 201 |
| }, |
| { |
| "epoch": 1.5107981220657276, |
| "grad_norm": 1.0625, |
| "learning_rate": 4.25e-05, |
| "loss": 0.5583, |
| "step": 202 |
| }, |
| { |
| "epoch": 1.5183098591549296, |
| "grad_norm": 0.9453125, |
| "learning_rate": 4.2462686567164186e-05, |
| "loss": 0.4876, |
| "step": 203 |
| }, |
| { |
| "epoch": 1.5258215962441315, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.242537313432836e-05, |
| "loss": 0.4504, |
| "step": 204 |
| }, |
| { |
| "epoch": 1.5333333333333332, |
| "grad_norm": 0.93359375, |
| "learning_rate": 4.238805970149254e-05, |
| "loss": 0.4757, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.5408450704225352, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.235074626865671e-05, |
| "loss": 0.5933, |
| "step": 206 |
| }, |
| { |
| "epoch": 1.548356807511737, |
| "grad_norm": 1.0078125, |
| "learning_rate": 4.2313432835820895e-05, |
| "loss": 0.5415, |
| "step": 207 |
| }, |
| { |
| "epoch": 1.5558685446009388, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.227611940298508e-05, |
| "loss": 0.7041, |
| "step": 208 |
| }, |
| { |
| "epoch": 1.563380281690141, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.2238805970149254e-05, |
| "loss": 0.5753, |
| "step": 209 |
| }, |
| { |
| "epoch": 1.5708920187793427, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.2201492537313436e-05, |
| "loss": 0.552, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.5784037558685446, |
| "grad_norm": 1.0, |
| "learning_rate": 4.216417910447761e-05, |
| "loss": 0.5549, |
| "step": 211 |
| }, |
| { |
| "epoch": 1.5859154929577466, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.212686567164179e-05, |
| "loss": 0.6679, |
| "step": 212 |
| }, |
| { |
| "epoch": 1.5934272300469483, |
| "grad_norm": 2.015625, |
| "learning_rate": 4.208955223880597e-05, |
| "loss": 0.796, |
| "step": 213 |
| }, |
| { |
| "epoch": 1.6009389671361502, |
| "grad_norm": 2.953125, |
| "learning_rate": 4.205223880597015e-05, |
| "loss": 0.6427, |
| "step": 214 |
| }, |
| { |
| "epoch": 1.6084507042253522, |
| "grad_norm": 0.90625, |
| "learning_rate": 4.201492537313433e-05, |
| "loss": 0.466, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.615962441314554, |
| "grad_norm": 0.94140625, |
| "learning_rate": 4.197761194029851e-05, |
| "loss": 0.4242, |
| "step": 216 |
| }, |
| { |
| "epoch": 1.6234741784037559, |
| "grad_norm": 3.0, |
| "learning_rate": 4.194029850746269e-05, |
| "loss": 0.6968, |
| "step": 217 |
| }, |
| { |
| "epoch": 1.6309859154929578, |
| "grad_norm": 0.9765625, |
| "learning_rate": 4.190298507462686e-05, |
| "loss": 0.5988, |
| "step": 218 |
| }, |
| { |
| "epoch": 1.6384976525821595, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.186567164179105e-05, |
| "loss": 0.4979, |
| "step": 219 |
| }, |
| { |
| "epoch": 1.6460093896713615, |
| "grad_norm": 0.83203125, |
| "learning_rate": 4.182835820895523e-05, |
| "loss": 0.3527, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.6535211267605634, |
| "grad_norm": 1.09375, |
| "learning_rate": 4.1791044776119404e-05, |
| "loss": 0.5743, |
| "step": 221 |
| }, |
| { |
| "epoch": 1.6610328638497651, |
| "grad_norm": 1.3203125, |
| "learning_rate": 4.1753731343283586e-05, |
| "loss": 0.6427, |
| "step": 222 |
| }, |
| { |
| "epoch": 1.6685446009389673, |
| "grad_norm": 1.2109375, |
| "learning_rate": 4.171641791044776e-05, |
| "loss": 0.6496, |
| "step": 223 |
| }, |
| { |
| "epoch": 1.676056338028169, |
| "grad_norm": 0.8515625, |
| "learning_rate": 4.167910447761194e-05, |
| "loss": 0.3463, |
| "step": 224 |
| }, |
| { |
| "epoch": 1.683568075117371, |
| "grad_norm": 1.34375, |
| "learning_rate": 4.164179104477613e-05, |
| "loss": 0.6243, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.6910798122065729, |
| "grad_norm": 0.98828125, |
| "learning_rate": 4.16044776119403e-05, |
| "loss": 0.5389, |
| "step": 226 |
| }, |
| { |
| "epoch": 1.6985915492957746, |
| "grad_norm": 1.0, |
| "learning_rate": 4.156716417910448e-05, |
| "loss": 0.5515, |
| "step": 227 |
| }, |
| { |
| "epoch": 1.7061032863849765, |
| "grad_norm": 1.2890625, |
| "learning_rate": 4.152985074626866e-05, |
| "loss": 0.6783, |
| "step": 228 |
| }, |
| { |
| "epoch": 1.7136150234741785, |
| "grad_norm": 1.0390625, |
| "learning_rate": 4.149253731343284e-05, |
| "loss": 0.5099, |
| "step": 229 |
| }, |
| { |
| "epoch": 1.7211267605633802, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.145522388059702e-05, |
| "loss": 0.461, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.7286384976525822, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.1417910447761195e-05, |
| "loss": 0.4953, |
| "step": 231 |
| }, |
| { |
| "epoch": 1.736150234741784, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.138059701492538e-05, |
| "loss": 0.5662, |
| "step": 232 |
| }, |
| { |
| "epoch": 1.7436619718309858, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.1343283582089554e-05, |
| "loss": 0.524, |
| "step": 233 |
| }, |
| { |
| "epoch": 1.7511737089201878, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.130597014925373e-05, |
| "loss": 0.424, |
| "step": 234 |
| }, |
| { |
| "epoch": 1.7586854460093897, |
| "grad_norm": 1.5625, |
| "learning_rate": 4.126865671641791e-05, |
| "loss": 0.5042, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.7661971830985914, |
| "grad_norm": 0.9765625, |
| "learning_rate": 4.1231343283582094e-05, |
| "loss": 0.5515, |
| "step": 236 |
| }, |
| { |
| "epoch": 1.7737089201877936, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.119402985074627e-05, |
| "loss": 0.6026, |
| "step": 237 |
| }, |
| { |
| "epoch": 1.7812206572769953, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.115671641791045e-05, |
| "loss": 0.6206, |
| "step": 238 |
| }, |
| { |
| "epoch": 1.788732394366197, |
| "grad_norm": 1.96875, |
| "learning_rate": 4.111940298507463e-05, |
| "loss": 0.6904, |
| "step": 239 |
| }, |
| { |
| "epoch": 1.7962441314553992, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.1082089552238804e-05, |
| "loss": 0.4693, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.803755868544601, |
| "grad_norm": 0.8984375, |
| "learning_rate": 4.104477611940299e-05, |
| "loss": 0.3648, |
| "step": 241 |
| }, |
| { |
| "epoch": 1.8112676056338028, |
| "grad_norm": 1.2734375, |
| "learning_rate": 4.100746268656717e-05, |
| "loss": 0.6071, |
| "step": 242 |
| }, |
| { |
| "epoch": 1.8187793427230048, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.0970149253731345e-05, |
| "loss": 0.4936, |
| "step": 243 |
| }, |
| { |
| "epoch": 1.8262910798122065, |
| "grad_norm": 1.0625, |
| "learning_rate": 4.093283582089553e-05, |
| "loss": 0.6627, |
| "step": 244 |
| }, |
| { |
| "epoch": 1.8338028169014085, |
| "grad_norm": 1.0625, |
| "learning_rate": 4.0895522388059703e-05, |
| "loss": 0.4625, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.8413145539906104, |
| "grad_norm": 1.2890625, |
| "learning_rate": 4.085820895522388e-05, |
| "loss": 0.5132, |
| "step": 246 |
| }, |
| { |
| "epoch": 1.8488262910798121, |
| "grad_norm": 0.91796875, |
| "learning_rate": 4.082089552238806e-05, |
| "loss": 0.5464, |
| "step": 247 |
| }, |
| { |
| "epoch": 1.856338028169014, |
| "grad_norm": 1.5390625, |
| "learning_rate": 4.0783582089552244e-05, |
| "loss": 0.6268, |
| "step": 248 |
| }, |
| { |
| "epoch": 1.863849765258216, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.074626865671642e-05, |
| "loss": 0.4702, |
| "step": 249 |
| }, |
| { |
| "epoch": 1.8713615023474177, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.07089552238806e-05, |
| "loss": 0.6875, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.8788732394366199, |
| "grad_norm": 1.828125, |
| "learning_rate": 4.067164179104478e-05, |
| "loss": 0.5994, |
| "step": 251 |
| }, |
| { |
| "epoch": 1.8863849765258216, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.0634328358208954e-05, |
| "loss": 0.513, |
| "step": 252 |
| }, |
| { |
| "epoch": 1.8938967136150233, |
| "grad_norm": 0.99609375, |
| "learning_rate": 4.059701492537314e-05, |
| "loss": 0.5234, |
| "step": 253 |
| }, |
| { |
| "epoch": 1.9014084507042255, |
| "grad_norm": 1.03125, |
| "learning_rate": 4.055970149253732e-05, |
| "loss": 0.3845, |
| "step": 254 |
| }, |
| { |
| "epoch": 1.9089201877934272, |
| "grad_norm": 0.8515625, |
| "learning_rate": 4.0522388059701495e-05, |
| "loss": 0.5088, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.9164319248826291, |
| "grad_norm": 1.3359375, |
| "learning_rate": 4.048507462686567e-05, |
| "loss": 0.5853, |
| "step": 256 |
| }, |
| { |
| "epoch": 1.923943661971831, |
| "grad_norm": 1.0859375, |
| "learning_rate": 4.044776119402985e-05, |
| "loss": 0.5691, |
| "step": 257 |
| }, |
| { |
| "epoch": 1.9314553990610328, |
| "grad_norm": 1.2890625, |
| "learning_rate": 4.041044776119403e-05, |
| "loss": 0.5251, |
| "step": 258 |
| }, |
| { |
| "epoch": 1.9389671361502347, |
| "grad_norm": 1.0390625, |
| "learning_rate": 4.037313432835821e-05, |
| "loss": 0.5245, |
| "step": 259 |
| }, |
| { |
| "epoch": 1.9464788732394367, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.0335820895522394e-05, |
| "loss": 0.5217, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.9539906103286384, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.029850746268657e-05, |
| "loss": 0.4795, |
| "step": 261 |
| }, |
| { |
| "epoch": 1.9615023474178404, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.0261194029850746e-05, |
| "loss": 0.5037, |
| "step": 262 |
| }, |
| { |
| "epoch": 1.9690140845070423, |
| "grad_norm": 1.0, |
| "learning_rate": 4.022388059701493e-05, |
| "loss": 0.5489, |
| "step": 263 |
| }, |
| { |
| "epoch": 1.976525821596244, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.0186567164179104e-05, |
| "loss": 0.6005, |
| "step": 264 |
| }, |
| { |
| "epoch": 1.984037558685446, |
| "grad_norm": 0.96875, |
| "learning_rate": 4.014925373134329e-05, |
| "loss": 0.4849, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.991549295774648, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.011194029850747e-05, |
| "loss": 0.541, |
| "step": 266 |
| }, |
| { |
| "epoch": 1.9990610328638496, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.0074626865671645e-05, |
| "loss": 0.5955, |
| "step": 267 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 3.390625, |
| "learning_rate": 4.003731343283582e-05, |
| "loss": 0.4322, |
| "step": 268 |
| }, |
| { |
| "epoch": 2.0075117370892017, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4e-05, |
| "loss": 0.4255, |
| "step": 269 |
| }, |
| { |
| "epoch": 2.015023474178404, |
| "grad_norm": 1.140625, |
| "learning_rate": 3.996268656716418e-05, |
| "loss": 0.4638, |
| "step": 270 |
| }, |
| { |
| "epoch": 2.0225352112676056, |
| "grad_norm": 1.046875, |
| "learning_rate": 3.992537313432836e-05, |
| "loss": 0.4727, |
| "step": 271 |
| }, |
| { |
| "epoch": 2.0300469483568073, |
| "grad_norm": 0.94921875, |
| "learning_rate": 3.9888059701492544e-05, |
| "loss": 0.3631, |
| "step": 272 |
| }, |
| { |
| "epoch": 2.0375586854460095, |
| "grad_norm": 1.234375, |
| "learning_rate": 3.985074626865672e-05, |
| "loss": 0.4291, |
| "step": 273 |
| }, |
| { |
| "epoch": 2.045070422535211, |
| "grad_norm": 1.3125, |
| "learning_rate": 3.9813432835820896e-05, |
| "loss": 0.4012, |
| "step": 274 |
| }, |
| { |
| "epoch": 2.052582159624413, |
| "grad_norm": 1.453125, |
| "learning_rate": 3.977611940298508e-05, |
| "loss": 0.5699, |
| "step": 275 |
| }, |
| { |
| "epoch": 2.060093896713615, |
| "grad_norm": 1.546875, |
| "learning_rate": 3.9738805970149254e-05, |
| "loss": 0.4192, |
| "step": 276 |
| }, |
| { |
| "epoch": 2.067605633802817, |
| "grad_norm": 1.5078125, |
| "learning_rate": 3.9701492537313437e-05, |
| "loss": 0.442, |
| "step": 277 |
| }, |
| { |
| "epoch": 2.075117370892019, |
| "grad_norm": 1.078125, |
| "learning_rate": 3.966417910447761e-05, |
| "loss": 0.3812, |
| "step": 278 |
| }, |
| { |
| "epoch": 2.0826291079812207, |
| "grad_norm": 0.9609375, |
| "learning_rate": 3.9626865671641795e-05, |
| "loss": 0.3047, |
| "step": 279 |
| }, |
| { |
| "epoch": 2.0901408450704224, |
| "grad_norm": 1.1640625, |
| "learning_rate": 3.958955223880597e-05, |
| "loss": 0.474, |
| "step": 280 |
| }, |
| { |
| "epoch": 2.0976525821596246, |
| "grad_norm": 1.1484375, |
| "learning_rate": 3.9552238805970146e-05, |
| "loss": 0.4265, |
| "step": 281 |
| }, |
| { |
| "epoch": 2.1051643192488263, |
| "grad_norm": 1.0234375, |
| "learning_rate": 3.951492537313433e-05, |
| "loss": 0.4247, |
| "step": 282 |
| }, |
| { |
| "epoch": 2.112676056338028, |
| "grad_norm": 1.125, |
| "learning_rate": 3.947761194029851e-05, |
| "loss": 0.4229, |
| "step": 283 |
| }, |
| { |
| "epoch": 2.12018779342723, |
| "grad_norm": 1.03125, |
| "learning_rate": 3.944029850746269e-05, |
| "loss": 0.585, |
| "step": 284 |
| }, |
| { |
| "epoch": 2.127699530516432, |
| "grad_norm": 0.9921875, |
| "learning_rate": 3.940298507462687e-05, |
| "loss": 0.4583, |
| "step": 285 |
| }, |
| { |
| "epoch": 2.1352112676056336, |
| "grad_norm": 1.0859375, |
| "learning_rate": 3.9365671641791046e-05, |
| "loss": 0.4564, |
| "step": 286 |
| }, |
| { |
| "epoch": 2.142723004694836, |
| "grad_norm": 1.1484375, |
| "learning_rate": 3.932835820895522e-05, |
| "loss": 0.4242, |
| "step": 287 |
| }, |
| { |
| "epoch": 2.1502347417840375, |
| "grad_norm": 1.0625, |
| "learning_rate": 3.9291044776119404e-05, |
| "loss": 0.492, |
| "step": 288 |
| }, |
| { |
| "epoch": 2.1577464788732392, |
| "grad_norm": 1.109375, |
| "learning_rate": 3.9253731343283586e-05, |
| "loss": 0.389, |
| "step": 289 |
| }, |
| { |
| "epoch": 2.1652582159624414, |
| "grad_norm": 0.9765625, |
| "learning_rate": 3.921641791044776e-05, |
| "loss": 0.382, |
| "step": 290 |
| }, |
| { |
| "epoch": 2.172769953051643, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.9179104477611945e-05, |
| "loss": 0.422, |
| "step": 291 |
| }, |
| { |
| "epoch": 2.1802816901408453, |
| "grad_norm": 1.2734375, |
| "learning_rate": 3.914179104477612e-05, |
| "loss": 0.4392, |
| "step": 292 |
| }, |
| { |
| "epoch": 2.187793427230047, |
| "grad_norm": 1.234375, |
| "learning_rate": 3.9104477611940296e-05, |
| "loss": 0.5264, |
| "step": 293 |
| }, |
| { |
| "epoch": 2.1953051643192487, |
| "grad_norm": 1.046875, |
| "learning_rate": 3.906716417910448e-05, |
| "loss": 0.3301, |
| "step": 294 |
| }, |
| { |
| "epoch": 2.202816901408451, |
| "grad_norm": 0.96875, |
| "learning_rate": 3.902985074626866e-05, |
| "loss": 0.4047, |
| "step": 295 |
| }, |
| { |
| "epoch": 2.2103286384976526, |
| "grad_norm": 1.328125, |
| "learning_rate": 3.899253731343284e-05, |
| "loss": 0.4317, |
| "step": 296 |
| }, |
| { |
| "epoch": 2.2178403755868543, |
| "grad_norm": 1.3359375, |
| "learning_rate": 3.895522388059702e-05, |
| "loss": 0.4581, |
| "step": 297 |
| }, |
| { |
| "epoch": 2.2253521126760565, |
| "grad_norm": 1.296875, |
| "learning_rate": 3.8917910447761195e-05, |
| "loss": 0.4516, |
| "step": 298 |
| }, |
| { |
| "epoch": 2.232863849765258, |
| "grad_norm": 0.9140625, |
| "learning_rate": 3.888059701492537e-05, |
| "loss": 0.2895, |
| "step": 299 |
| }, |
| { |
| "epoch": 2.24037558685446, |
| "grad_norm": 1.1640625, |
| "learning_rate": 3.8843283582089554e-05, |
| "loss": 0.4794, |
| "step": 300 |
| }, |
| { |
| "epoch": 2.247887323943662, |
| "grad_norm": 1.0859375, |
| "learning_rate": 3.8805970149253736e-05, |
| "loss": 0.5919, |
| "step": 301 |
| }, |
| { |
| "epoch": 2.255399061032864, |
| "grad_norm": 1.015625, |
| "learning_rate": 3.876865671641791e-05, |
| "loss": 0.4234, |
| "step": 302 |
| }, |
| { |
| "epoch": 2.262910798122066, |
| "grad_norm": 1.8203125, |
| "learning_rate": 3.873134328358209e-05, |
| "loss": 0.4109, |
| "step": 303 |
| }, |
| { |
| "epoch": 2.2704225352112677, |
| "grad_norm": 1.109375, |
| "learning_rate": 3.869402985074627e-05, |
| "loss": 0.465, |
| "step": 304 |
| }, |
| { |
| "epoch": 2.2779342723004694, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.8656716417910446e-05, |
| "loss": 0.4098, |
| "step": 305 |
| }, |
| { |
| "epoch": 2.2854460093896716, |
| "grad_norm": 1.109375, |
| "learning_rate": 3.861940298507463e-05, |
| "loss": 0.4484, |
| "step": 306 |
| }, |
| { |
| "epoch": 2.2929577464788733, |
| "grad_norm": 1.171875, |
| "learning_rate": 3.858208955223881e-05, |
| "loss": 0.4225, |
| "step": 307 |
| }, |
| { |
| "epoch": 2.300469483568075, |
| "grad_norm": 1.4296875, |
| "learning_rate": 3.854477611940299e-05, |
| "loss": 0.5249, |
| "step": 308 |
| }, |
| { |
| "epoch": 2.307981220657277, |
| "grad_norm": 0.96484375, |
| "learning_rate": 3.850746268656716e-05, |
| "loss": 0.376, |
| "step": 309 |
| }, |
| { |
| "epoch": 2.315492957746479, |
| "grad_norm": 1.28125, |
| "learning_rate": 3.8470149253731345e-05, |
| "loss": 0.4915, |
| "step": 310 |
| }, |
| { |
| "epoch": 2.3230046948356806, |
| "grad_norm": 0.98046875, |
| "learning_rate": 3.843283582089552e-05, |
| "loss": 0.3064, |
| "step": 311 |
| }, |
| { |
| "epoch": 2.330516431924883, |
| "grad_norm": 0.953125, |
| "learning_rate": 3.8395522388059704e-05, |
| "loss": 0.2775, |
| "step": 312 |
| }, |
| { |
| "epoch": 2.3380281690140845, |
| "grad_norm": 1.2421875, |
| "learning_rate": 3.8358208955223886e-05, |
| "loss": 0.4981, |
| "step": 313 |
| }, |
| { |
| "epoch": 2.345539906103286, |
| "grad_norm": 0.9375, |
| "learning_rate": 3.832089552238806e-05, |
| "loss": 0.3448, |
| "step": 314 |
| }, |
| { |
| "epoch": 2.3530516431924884, |
| "grad_norm": 0.953125, |
| "learning_rate": 3.828358208955224e-05, |
| "loss": 0.3782, |
| "step": 315 |
| }, |
| { |
| "epoch": 2.36056338028169, |
| "grad_norm": 1.4453125, |
| "learning_rate": 3.824626865671642e-05, |
| "loss": 0.6991, |
| "step": 316 |
| }, |
| { |
| "epoch": 2.368075117370892, |
| "grad_norm": 1.0234375, |
| "learning_rate": 3.8208955223880596e-05, |
| "loss": 0.3665, |
| "step": 317 |
| }, |
| { |
| "epoch": 2.375586854460094, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.817164179104478e-05, |
| "loss": 0.4168, |
| "step": 318 |
| }, |
| { |
| "epoch": 2.3830985915492957, |
| "grad_norm": 1.1796875, |
| "learning_rate": 3.813432835820896e-05, |
| "loss": 0.4422, |
| "step": 319 |
| }, |
| { |
| "epoch": 2.3906103286384974, |
| "grad_norm": 1.0234375, |
| "learning_rate": 3.809701492537314e-05, |
| "loss": 0.3613, |
| "step": 320 |
| }, |
| { |
| "epoch": 2.3981220657276996, |
| "grad_norm": 1.0703125, |
| "learning_rate": 3.805970149253731e-05, |
| "loss": 0.4135, |
| "step": 321 |
| }, |
| { |
| "epoch": 2.4056338028169013, |
| "grad_norm": 0.91015625, |
| "learning_rate": 3.8022388059701495e-05, |
| "loss": 0.3028, |
| "step": 322 |
| }, |
| { |
| "epoch": 2.4131455399061035, |
| "grad_norm": 1.0625, |
| "learning_rate": 3.798507462686567e-05, |
| "loss": 0.3747, |
| "step": 323 |
| }, |
| { |
| "epoch": 2.420657276995305, |
| "grad_norm": 0.8671875, |
| "learning_rate": 3.7947761194029854e-05, |
| "loss": 0.2747, |
| "step": 324 |
| }, |
| { |
| "epoch": 2.428169014084507, |
| "grad_norm": 0.8984375, |
| "learning_rate": 3.791044776119403e-05, |
| "loss": 0.3254, |
| "step": 325 |
| }, |
| { |
| "epoch": 2.435680751173709, |
| "grad_norm": 0.984375, |
| "learning_rate": 3.787313432835821e-05, |
| "loss": 0.4222, |
| "step": 326 |
| }, |
| { |
| "epoch": 2.443192488262911, |
| "grad_norm": 0.96484375, |
| "learning_rate": 3.783582089552239e-05, |
| "loss": 0.3116, |
| "step": 327 |
| }, |
| { |
| "epoch": 2.4507042253521125, |
| "grad_norm": 4.25, |
| "learning_rate": 3.7798507462686563e-05, |
| "loss": 0.4996, |
| "step": 328 |
| }, |
| { |
| "epoch": 2.4582159624413147, |
| "grad_norm": 1.15625, |
| "learning_rate": 3.776119402985075e-05, |
| "loss": 0.4413, |
| "step": 329 |
| }, |
| { |
| "epoch": 2.4657276995305164, |
| "grad_norm": 1.1484375, |
| "learning_rate": 3.772388059701493e-05, |
| "loss": 0.4048, |
| "step": 330 |
| }, |
| { |
| "epoch": 2.473239436619718, |
| "grad_norm": 0.96484375, |
| "learning_rate": 3.7686567164179104e-05, |
| "loss": 0.4141, |
| "step": 331 |
| }, |
| { |
| "epoch": 2.4807511737089203, |
| "grad_norm": 1.1640625, |
| "learning_rate": 3.764925373134329e-05, |
| "loss": 0.358, |
| "step": 332 |
| }, |
| { |
| "epoch": 2.488262910798122, |
| "grad_norm": 1.0390625, |
| "learning_rate": 3.761194029850746e-05, |
| "loss": 0.358, |
| "step": 333 |
| }, |
| { |
| "epoch": 2.495774647887324, |
| "grad_norm": 1.3046875, |
| "learning_rate": 3.757462686567164e-05, |
| "loss": 0.5101, |
| "step": 334 |
| }, |
| { |
| "epoch": 2.503286384976526, |
| "grad_norm": 1.125, |
| "learning_rate": 3.753731343283583e-05, |
| "loss": 0.4161, |
| "step": 335 |
| }, |
| { |
| "epoch": 2.5107981220657276, |
| "grad_norm": 1.15625, |
| "learning_rate": 3.7500000000000003e-05, |
| "loss": 0.4621, |
| "step": 336 |
| }, |
| { |
| "epoch": 2.5183098591549298, |
| "grad_norm": 0.99609375, |
| "learning_rate": 3.746268656716418e-05, |
| "loss": 0.4423, |
| "step": 337 |
| }, |
| { |
| "epoch": 2.5258215962441315, |
| "grad_norm": 0.921875, |
| "learning_rate": 3.742537313432836e-05, |
| "loss": 0.3696, |
| "step": 338 |
| }, |
| { |
| "epoch": 2.533333333333333, |
| "grad_norm": 0.97265625, |
| "learning_rate": 3.738805970149254e-05, |
| "loss": 0.3723, |
| "step": 339 |
| }, |
| { |
| "epoch": 2.5408450704225354, |
| "grad_norm": 1.1875, |
| "learning_rate": 3.735074626865671e-05, |
| "loss": 0.478, |
| "step": 340 |
| }, |
| { |
| "epoch": 2.548356807511737, |
| "grad_norm": 1.0625, |
| "learning_rate": 3.73134328358209e-05, |
| "loss": 0.4382, |
| "step": 341 |
| }, |
| { |
| "epoch": 2.555868544600939, |
| "grad_norm": 1.03125, |
| "learning_rate": 3.727611940298508e-05, |
| "loss": 0.3668, |
| "step": 342 |
| }, |
| { |
| "epoch": 2.563380281690141, |
| "grad_norm": 1.1796875, |
| "learning_rate": 3.7238805970149254e-05, |
| "loss": 0.4139, |
| "step": 343 |
| }, |
| { |
| "epoch": 2.5708920187793427, |
| "grad_norm": 1.0625, |
| "learning_rate": 3.720149253731344e-05, |
| "loss": 0.4257, |
| "step": 344 |
| }, |
| { |
| "epoch": 2.5784037558685444, |
| "grad_norm": 1.0703125, |
| "learning_rate": 3.716417910447761e-05, |
| "loss": 0.49, |
| "step": 345 |
| }, |
| { |
| "epoch": 2.5859154929577466, |
| "grad_norm": 1.1015625, |
| "learning_rate": 3.7126865671641795e-05, |
| "loss": 0.4485, |
| "step": 346 |
| }, |
| { |
| "epoch": 2.5934272300469483, |
| "grad_norm": 1.1875, |
| "learning_rate": 3.708955223880598e-05, |
| "loss": 0.451, |
| "step": 347 |
| }, |
| { |
| "epoch": 2.60093896713615, |
| "grad_norm": 1.078125, |
| "learning_rate": 3.7052238805970153e-05, |
| "loss": 0.4173, |
| "step": 348 |
| }, |
| { |
| "epoch": 2.608450704225352, |
| "grad_norm": 1.140625, |
| "learning_rate": 3.701492537313433e-05, |
| "loss": 0.3067, |
| "step": 349 |
| }, |
| { |
| "epoch": 2.615962441314554, |
| "grad_norm": 1.171875, |
| "learning_rate": 3.6977611940298505e-05, |
| "loss": 0.3344, |
| "step": 350 |
| }, |
| { |
| "epoch": 2.6234741784037556, |
| "grad_norm": 1.0, |
| "learning_rate": 3.694029850746269e-05, |
| "loss": 0.3942, |
| "step": 351 |
| }, |
| { |
| "epoch": 2.630985915492958, |
| "grad_norm": 1.359375, |
| "learning_rate": 3.690298507462687e-05, |
| "loss": 0.5222, |
| "step": 352 |
| }, |
| { |
| "epoch": 2.6384976525821595, |
| "grad_norm": 1.03125, |
| "learning_rate": 3.6865671641791046e-05, |
| "loss": 0.2982, |
| "step": 353 |
| }, |
| { |
| "epoch": 2.6460093896713612, |
| "grad_norm": 1.828125, |
| "learning_rate": 3.682835820895523e-05, |
| "loss": 0.4765, |
| "step": 354 |
| }, |
| { |
| "epoch": 2.6535211267605634, |
| "grad_norm": 0.9921875, |
| "learning_rate": 3.6791044776119404e-05, |
| "loss": 0.3604, |
| "step": 355 |
| }, |
| { |
| "epoch": 2.661032863849765, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.675373134328358e-05, |
| "loss": 0.4295, |
| "step": 356 |
| }, |
| { |
| "epoch": 2.6685446009389673, |
| "grad_norm": 1.09375, |
| "learning_rate": 3.671641791044776e-05, |
| "loss": 0.3636, |
| "step": 357 |
| }, |
| { |
| "epoch": 2.676056338028169, |
| "grad_norm": 1.1640625, |
| "learning_rate": 3.6679104477611945e-05, |
| "loss": 0.4205, |
| "step": 358 |
| }, |
| { |
| "epoch": 2.683568075117371, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.664179104477612e-05, |
| "loss": 0.6766, |
| "step": 359 |
| }, |
| { |
| "epoch": 2.691079812206573, |
| "grad_norm": 0.94921875, |
| "learning_rate": 3.66044776119403e-05, |
| "loss": 0.3813, |
| "step": 360 |
| }, |
| { |
| "epoch": 2.6985915492957746, |
| "grad_norm": 1.09375, |
| "learning_rate": 3.656716417910448e-05, |
| "loss": 0.4775, |
| "step": 361 |
| }, |
| { |
| "epoch": 2.7061032863849768, |
| "grad_norm": 1.3359375, |
| "learning_rate": 3.6529850746268655e-05, |
| "loss": 0.4854, |
| "step": 362 |
| }, |
| { |
| "epoch": 2.7136150234741785, |
| "grad_norm": 1.5625, |
| "learning_rate": 3.649253731343284e-05, |
| "loss": 0.5122, |
| "step": 363 |
| }, |
| { |
| "epoch": 2.72112676056338, |
| "grad_norm": 1.0234375, |
| "learning_rate": 3.645522388059702e-05, |
| "loss": 0.389, |
| "step": 364 |
| }, |
| { |
| "epoch": 2.7286384976525824, |
| "grad_norm": 1.0546875, |
| "learning_rate": 3.6417910447761196e-05, |
| "loss": 0.4442, |
| "step": 365 |
| }, |
| { |
| "epoch": 2.736150234741784, |
| "grad_norm": 1.0390625, |
| "learning_rate": 3.638059701492538e-05, |
| "loss": 0.3946, |
| "step": 366 |
| }, |
| { |
| "epoch": 2.743661971830986, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.6343283582089554e-05, |
| "loss": 0.3582, |
| "step": 367 |
| }, |
| { |
| "epoch": 2.751173708920188, |
| "grad_norm": 1.15625, |
| "learning_rate": 3.630597014925373e-05, |
| "loss": 0.4962, |
| "step": 368 |
| }, |
| { |
| "epoch": 2.7586854460093897, |
| "grad_norm": 1.140625, |
| "learning_rate": 3.626865671641791e-05, |
| "loss": 0.4519, |
| "step": 369 |
| }, |
| { |
| "epoch": 2.7661971830985914, |
| "grad_norm": 1.046875, |
| "learning_rate": 3.6231343283582095e-05, |
| "loss": 0.4805, |
| "step": 370 |
| }, |
| { |
| "epoch": 2.7737089201877936, |
| "grad_norm": 1.203125, |
| "learning_rate": 3.619402985074627e-05, |
| "loss": 0.3858, |
| "step": 371 |
| }, |
| { |
| "epoch": 2.7812206572769953, |
| "grad_norm": 1.25, |
| "learning_rate": 3.6156716417910446e-05, |
| "loss": 0.3975, |
| "step": 372 |
| }, |
| { |
| "epoch": 2.788732394366197, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.611940298507463e-05, |
| "loss": 0.5112, |
| "step": 373 |
| }, |
| { |
| "epoch": 2.796244131455399, |
| "grad_norm": 1.0625, |
| "learning_rate": 3.6082089552238805e-05, |
| "loss": 0.4478, |
| "step": 374 |
| }, |
| { |
| "epoch": 2.803755868544601, |
| "grad_norm": 1.2265625, |
| "learning_rate": 3.604477611940299e-05, |
| "loss": 0.3357, |
| "step": 375 |
| }, |
| { |
| "epoch": 2.8112676056338026, |
| "grad_norm": 1.0859375, |
| "learning_rate": 3.600746268656717e-05, |
| "loss": 0.3859, |
| "step": 376 |
| }, |
| { |
| "epoch": 2.818779342723005, |
| "grad_norm": 0.921875, |
| "learning_rate": 3.5970149253731346e-05, |
| "loss": 0.3302, |
| "step": 377 |
| }, |
| { |
| "epoch": 2.8262910798122065, |
| "grad_norm": 1.0703125, |
| "learning_rate": 3.593283582089552e-05, |
| "loss": 0.4415, |
| "step": 378 |
| }, |
| { |
| "epoch": 2.8338028169014082, |
| "grad_norm": 1.0703125, |
| "learning_rate": 3.5895522388059704e-05, |
| "loss": 0.322, |
| "step": 379 |
| }, |
| { |
| "epoch": 2.8413145539906104, |
| "grad_norm": 1.09375, |
| "learning_rate": 3.585820895522388e-05, |
| "loss": 0.3689, |
| "step": 380 |
| }, |
| { |
| "epoch": 2.848826291079812, |
| "grad_norm": 1.1484375, |
| "learning_rate": 3.582089552238806e-05, |
| "loss": 0.4491, |
| "step": 381 |
| }, |
| { |
| "epoch": 2.856338028169014, |
| "grad_norm": 1.1171875, |
| "learning_rate": 3.5783582089552245e-05, |
| "loss": 0.5455, |
| "step": 382 |
| }, |
| { |
| "epoch": 2.863849765258216, |
| "grad_norm": 1.625, |
| "learning_rate": 3.574626865671642e-05, |
| "loss": 0.4963, |
| "step": 383 |
| }, |
| { |
| "epoch": 2.8713615023474177, |
| "grad_norm": 1.140625, |
| "learning_rate": 3.5708955223880596e-05, |
| "loss": 0.5053, |
| "step": 384 |
| }, |
| { |
| "epoch": 2.87887323943662, |
| "grad_norm": 1.0, |
| "learning_rate": 3.567164179104478e-05, |
| "loss": 0.4466, |
| "step": 385 |
| }, |
| { |
| "epoch": 2.8863849765258216, |
| "grad_norm": 0.9375, |
| "learning_rate": 3.5634328358208955e-05, |
| "loss": 0.3599, |
| "step": 386 |
| }, |
| { |
| "epoch": 2.8938967136150233, |
| "grad_norm": 1.390625, |
| "learning_rate": 3.559701492537314e-05, |
| "loss": 0.4515, |
| "step": 387 |
| }, |
| { |
| "epoch": 2.9014084507042255, |
| "grad_norm": 1.0390625, |
| "learning_rate": 3.555970149253732e-05, |
| "loss": 0.4526, |
| "step": 388 |
| }, |
| { |
| "epoch": 2.908920187793427, |
| "grad_norm": 1.2890625, |
| "learning_rate": 3.5522388059701495e-05, |
| "loss": 0.4239, |
| "step": 389 |
| }, |
| { |
| "epoch": 2.9164319248826294, |
| "grad_norm": 1.046875, |
| "learning_rate": 3.548507462686567e-05, |
| "loss": 0.3883, |
| "step": 390 |
| }, |
| { |
| "epoch": 2.923943661971831, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.5447761194029854e-05, |
| "loss": 0.486, |
| "step": 391 |
| }, |
| { |
| "epoch": 2.931455399061033, |
| "grad_norm": 1.109375, |
| "learning_rate": 3.541044776119403e-05, |
| "loss": 0.502, |
| "step": 392 |
| }, |
| { |
| "epoch": 2.938967136150235, |
| "grad_norm": 1.0703125, |
| "learning_rate": 3.537313432835821e-05, |
| "loss": 0.5833, |
| "step": 393 |
| }, |
| { |
| "epoch": 2.9464788732394367, |
| "grad_norm": 1.1015625, |
| "learning_rate": 3.5335820895522395e-05, |
| "loss": 0.3583, |
| "step": 394 |
| }, |
| { |
| "epoch": 2.9539906103286384, |
| "grad_norm": 1.078125, |
| "learning_rate": 3.529850746268657e-05, |
| "loss": 0.3327, |
| "step": 395 |
| }, |
| { |
| "epoch": 2.9615023474178406, |
| "grad_norm": 1.2265625, |
| "learning_rate": 3.5261194029850746e-05, |
| "loss": 0.4078, |
| "step": 396 |
| }, |
| { |
| "epoch": 2.9690140845070423, |
| "grad_norm": 0.97265625, |
| "learning_rate": 3.522388059701492e-05, |
| "loss": 0.3805, |
| "step": 397 |
| }, |
| { |
| "epoch": 2.976525821596244, |
| "grad_norm": 0.953125, |
| "learning_rate": 3.5186567164179105e-05, |
| "loss": 0.3543, |
| "step": 398 |
| }, |
| { |
| "epoch": 2.984037558685446, |
| "grad_norm": 1.03125, |
| "learning_rate": 3.514925373134329e-05, |
| "loss": 0.3794, |
| "step": 399 |
| }, |
| { |
| "epoch": 2.991549295774648, |
| "grad_norm": 1.1015625, |
| "learning_rate": 3.511194029850746e-05, |
| "loss": 0.4541, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.9990610328638496, |
| "grad_norm": 1.0625, |
| "learning_rate": 3.5074626865671645e-05, |
| "loss": 0.4267, |
| "step": 401 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 3.6875, |
| "learning_rate": 3.503731343283582e-05, |
| "loss": 0.3003, |
| "step": 402 |
| }, |
| { |
| "epoch": 3.0075117370892017, |
| "grad_norm": 1.1796875, |
| "learning_rate": 3.5e-05, |
| "loss": 0.3058, |
| "step": 403 |
| }, |
| { |
| "epoch": 3.015023474178404, |
| "grad_norm": 0.984375, |
| "learning_rate": 3.496268656716418e-05, |
| "loss": 0.234, |
| "step": 404 |
| }, |
| { |
| "epoch": 3.0225352112676056, |
| "grad_norm": 1.0859375, |
| "learning_rate": 3.492537313432836e-05, |
| "loss": 0.3378, |
| "step": 405 |
| }, |
| { |
| "epoch": 3.0300469483568073, |
| "grad_norm": 0.94921875, |
| "learning_rate": 3.488805970149254e-05, |
| "loss": 0.2556, |
| "step": 406 |
| }, |
| { |
| "epoch": 3.0375586854460095, |
| "grad_norm": 1.1015625, |
| "learning_rate": 3.485074626865672e-05, |
| "loss": 0.2731, |
| "step": 407 |
| }, |
| { |
| "epoch": 3.045070422535211, |
| "grad_norm": 1.25, |
| "learning_rate": 3.4813432835820896e-05, |
| "loss": 0.2886, |
| "step": 408 |
| }, |
| { |
| "epoch": 3.052582159624413, |
| "grad_norm": 2.171875, |
| "learning_rate": 3.477611940298507e-05, |
| "loss": 0.3671, |
| "step": 409 |
| }, |
| { |
| "epoch": 3.060093896713615, |
| "grad_norm": 1.8125, |
| "learning_rate": 3.4738805970149254e-05, |
| "loss": 0.3331, |
| "step": 410 |
| }, |
| { |
| "epoch": 3.067605633802817, |
| "grad_norm": 1.7421875, |
| "learning_rate": 3.470149253731344e-05, |
| "loss": 0.3577, |
| "step": 411 |
| }, |
| { |
| "epoch": 3.075117370892019, |
| "grad_norm": 1.890625, |
| "learning_rate": 3.466417910447761e-05, |
| "loss": 0.2809, |
| "step": 412 |
| }, |
| { |
| "epoch": 3.0826291079812207, |
| "grad_norm": 1.4609375, |
| "learning_rate": 3.4626865671641795e-05, |
| "loss": 0.3122, |
| "step": 413 |
| }, |
| { |
| "epoch": 3.0901408450704224, |
| "grad_norm": 1.125, |
| "learning_rate": 3.458955223880597e-05, |
| "loss": 0.2599, |
| "step": 414 |
| }, |
| { |
| "epoch": 3.0976525821596246, |
| "grad_norm": 1.2734375, |
| "learning_rate": 3.455223880597015e-05, |
| "loss": 0.3066, |
| "step": 415 |
| }, |
| { |
| "epoch": 3.1051643192488263, |
| "grad_norm": 1.125, |
| "learning_rate": 3.451492537313433e-05, |
| "loss": 0.266, |
| "step": 416 |
| }, |
| { |
| "epoch": 3.112676056338028, |
| "grad_norm": 1.0859375, |
| "learning_rate": 3.447761194029851e-05, |
| "loss": 0.3201, |
| "step": 417 |
| }, |
| { |
| "epoch": 3.12018779342723, |
| "grad_norm": 0.89453125, |
| "learning_rate": 3.444029850746269e-05, |
| "loss": 0.1919, |
| "step": 418 |
| }, |
| { |
| "epoch": 3.127699530516432, |
| "grad_norm": 1.1640625, |
| "learning_rate": 3.440298507462687e-05, |
| "loss": 0.3906, |
| "step": 419 |
| }, |
| { |
| "epoch": 3.1352112676056336, |
| "grad_norm": 1.078125, |
| "learning_rate": 3.4365671641791046e-05, |
| "loss": 0.2502, |
| "step": 420 |
| }, |
| { |
| "epoch": 3.142723004694836, |
| "grad_norm": 1.1171875, |
| "learning_rate": 3.432835820895522e-05, |
| "loss": 0.2846, |
| "step": 421 |
| }, |
| { |
| "epoch": 3.1502347417840375, |
| "grad_norm": 1.0703125, |
| "learning_rate": 3.4291044776119404e-05, |
| "loss": 0.3431, |
| "step": 422 |
| }, |
| { |
| "epoch": 3.1577464788732392, |
| "grad_norm": 1.0859375, |
| "learning_rate": 3.425373134328359e-05, |
| "loss": 0.2536, |
| "step": 423 |
| }, |
| { |
| "epoch": 3.1652582159624414, |
| "grad_norm": 1.296875, |
| "learning_rate": 3.421641791044776e-05, |
| "loss": 0.2595, |
| "step": 424 |
| }, |
| { |
| "epoch": 3.172769953051643, |
| "grad_norm": 1.25, |
| "learning_rate": 3.417910447761194e-05, |
| "loss": 0.3605, |
| "step": 425 |
| }, |
| { |
| "epoch": 3.1802816901408453, |
| "grad_norm": 1.59375, |
| "learning_rate": 3.414179104477612e-05, |
| "loss": 0.4784, |
| "step": 426 |
| }, |
| { |
| "epoch": 3.187793427230047, |
| "grad_norm": 1.203125, |
| "learning_rate": 3.41044776119403e-05, |
| "loss": 0.2589, |
| "step": 427 |
| }, |
| { |
| "epoch": 3.1953051643192487, |
| "grad_norm": 1.328125, |
| "learning_rate": 3.406716417910448e-05, |
| "loss": 0.2985, |
| "step": 428 |
| }, |
| { |
| "epoch": 3.202816901408451, |
| "grad_norm": 1.4375, |
| "learning_rate": 3.402985074626866e-05, |
| "loss": 0.2615, |
| "step": 429 |
| }, |
| { |
| "epoch": 3.2103286384976526, |
| "grad_norm": 1.140625, |
| "learning_rate": 3.399253731343284e-05, |
| "loss": 0.2181, |
| "step": 430 |
| }, |
| { |
| "epoch": 3.2178403755868543, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.395522388059701e-05, |
| "loss": 0.3046, |
| "step": 431 |
| }, |
| { |
| "epoch": 3.2253521126760565, |
| "grad_norm": 1.1171875, |
| "learning_rate": 3.3917910447761196e-05, |
| "loss": 0.2563, |
| "step": 432 |
| }, |
| { |
| "epoch": 3.232863849765258, |
| "grad_norm": 0.98046875, |
| "learning_rate": 3.388059701492537e-05, |
| "loss": 0.2574, |
| "step": 433 |
| }, |
| { |
| "epoch": 3.24037558685446, |
| "grad_norm": 1.0625, |
| "learning_rate": 3.3843283582089554e-05, |
| "loss": 0.2544, |
| "step": 434 |
| }, |
| { |
| "epoch": 3.247887323943662, |
| "grad_norm": 1.0078125, |
| "learning_rate": 3.380597014925374e-05, |
| "loss": 0.2205, |
| "step": 435 |
| }, |
| { |
| "epoch": 3.255399061032864, |
| "grad_norm": 0.9453125, |
| "learning_rate": 3.376865671641791e-05, |
| "loss": 0.2099, |
| "step": 436 |
| }, |
| { |
| "epoch": 3.262910798122066, |
| "grad_norm": 1.0703125, |
| "learning_rate": 3.373134328358209e-05, |
| "loss": 0.3441, |
| "step": 437 |
| }, |
| { |
| "epoch": 3.2704225352112677, |
| "grad_norm": 1.1484375, |
| "learning_rate": 3.369402985074627e-05, |
| "loss": 0.2991, |
| "step": 438 |
| }, |
| { |
| "epoch": 3.2779342723004694, |
| "grad_norm": 1.1953125, |
| "learning_rate": 3.365671641791045e-05, |
| "loss": 0.2694, |
| "step": 439 |
| }, |
| { |
| "epoch": 3.2854460093896716, |
| "grad_norm": 0.9765625, |
| "learning_rate": 3.361940298507463e-05, |
| "loss": 0.2535, |
| "step": 440 |
| }, |
| { |
| "epoch": 3.2929577464788733, |
| "grad_norm": 1.4375, |
| "learning_rate": 3.358208955223881e-05, |
| "loss": 0.3394, |
| "step": 441 |
| }, |
| { |
| "epoch": 3.300469483568075, |
| "grad_norm": 1.3984375, |
| "learning_rate": 3.354477611940299e-05, |
| "loss": 0.4334, |
| "step": 442 |
| }, |
| { |
| "epoch": 3.307981220657277, |
| "grad_norm": 1.1875, |
| "learning_rate": 3.350746268656716e-05, |
| "loss": 0.3504, |
| "step": 443 |
| }, |
| { |
| "epoch": 3.315492957746479, |
| "grad_norm": 1.1953125, |
| "learning_rate": 3.347014925373134e-05, |
| "loss": 0.3178, |
| "step": 444 |
| }, |
| { |
| "epoch": 3.3230046948356806, |
| "grad_norm": 1.140625, |
| "learning_rate": 3.343283582089553e-05, |
| "loss": 0.4878, |
| "step": 445 |
| }, |
| { |
| "epoch": 3.330516431924883, |
| "grad_norm": 1.1328125, |
| "learning_rate": 3.3395522388059704e-05, |
| "loss": 0.3015, |
| "step": 446 |
| }, |
| { |
| "epoch": 3.3380281690140845, |
| "grad_norm": 1.3203125, |
| "learning_rate": 3.335820895522388e-05, |
| "loss": 0.2809, |
| "step": 447 |
| }, |
| { |
| "epoch": 3.345539906103286, |
| "grad_norm": 1.234375, |
| "learning_rate": 3.332089552238806e-05, |
| "loss": 0.2838, |
| "step": 448 |
| }, |
| { |
| "epoch": 3.3530516431924884, |
| "grad_norm": 1.1015625, |
| "learning_rate": 3.328358208955224e-05, |
| "loss": 0.2251, |
| "step": 449 |
| }, |
| { |
| "epoch": 3.36056338028169, |
| "grad_norm": 1.203125, |
| "learning_rate": 3.3246268656716414e-05, |
| "loss": 0.3665, |
| "step": 450 |
| }, |
| { |
| "epoch": 3.368075117370892, |
| "grad_norm": 1.203125, |
| "learning_rate": 3.32089552238806e-05, |
| "loss": 0.3966, |
| "step": 451 |
| }, |
| { |
| "epoch": 3.375586854460094, |
| "grad_norm": 1.296875, |
| "learning_rate": 3.317164179104478e-05, |
| "loss": 0.2999, |
| "step": 452 |
| }, |
| { |
| "epoch": 3.3830985915492957, |
| "grad_norm": 1.0625, |
| "learning_rate": 3.3134328358208955e-05, |
| "loss": 0.2774, |
| "step": 453 |
| }, |
| { |
| "epoch": 3.3906103286384974, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.309701492537314e-05, |
| "loss": 0.4075, |
| "step": 454 |
| }, |
| { |
| "epoch": 3.3981220657276996, |
| "grad_norm": 1.171875, |
| "learning_rate": 3.305970149253731e-05, |
| "loss": 0.2425, |
| "step": 455 |
| }, |
| { |
| "epoch": 3.4056338028169013, |
| "grad_norm": 1.1328125, |
| "learning_rate": 3.302238805970149e-05, |
| "loss": 0.3725, |
| "step": 456 |
| }, |
| { |
| "epoch": 3.4131455399061035, |
| "grad_norm": 1.1328125, |
| "learning_rate": 3.298507462686568e-05, |
| "loss": 0.2723, |
| "step": 457 |
| }, |
| { |
| "epoch": 3.420657276995305, |
| "grad_norm": 1.15625, |
| "learning_rate": 3.2947761194029854e-05, |
| "loss": 0.3451, |
| "step": 458 |
| }, |
| { |
| "epoch": 3.428169014084507, |
| "grad_norm": 1.0078125, |
| "learning_rate": 3.291044776119403e-05, |
| "loss": 0.251, |
| "step": 459 |
| }, |
| { |
| "epoch": 3.435680751173709, |
| "grad_norm": 1.1953125, |
| "learning_rate": 3.287313432835821e-05, |
| "loss": 0.3167, |
| "step": 460 |
| }, |
| { |
| "epoch": 3.443192488262911, |
| "grad_norm": 1.140625, |
| "learning_rate": 3.283582089552239e-05, |
| "loss": 0.2979, |
| "step": 461 |
| }, |
| { |
| "epoch": 3.4507042253521125, |
| "grad_norm": 1.203125, |
| "learning_rate": 3.279850746268657e-05, |
| "loss": 0.2733, |
| "step": 462 |
| }, |
| { |
| "epoch": 3.4582159624413147, |
| "grad_norm": 1.140625, |
| "learning_rate": 3.276119402985075e-05, |
| "loss": 0.5612, |
| "step": 463 |
| }, |
| { |
| "epoch": 3.4657276995305164, |
| "grad_norm": 1.1640625, |
| "learning_rate": 3.272388059701493e-05, |
| "loss": 0.3, |
| "step": 464 |
| }, |
| { |
| "epoch": 3.473239436619718, |
| "grad_norm": 1.09375, |
| "learning_rate": 3.2686567164179105e-05, |
| "loss": 0.3114, |
| "step": 465 |
| }, |
| { |
| "epoch": 3.4807511737089203, |
| "grad_norm": 1.9609375, |
| "learning_rate": 3.264925373134329e-05, |
| "loss": 0.4609, |
| "step": 466 |
| }, |
| { |
| "epoch": 3.488262910798122, |
| "grad_norm": 1.40625, |
| "learning_rate": 3.261194029850746e-05, |
| "loss": 0.3911, |
| "step": 467 |
| }, |
| { |
| "epoch": 3.495774647887324, |
| "grad_norm": 1.265625, |
| "learning_rate": 3.2574626865671646e-05, |
| "loss": 0.3526, |
| "step": 468 |
| }, |
| { |
| "epoch": 3.503286384976526, |
| "grad_norm": 1.1328125, |
| "learning_rate": 3.253731343283582e-05, |
| "loss": 0.2948, |
| "step": 469 |
| }, |
| { |
| "epoch": 3.5107981220657276, |
| "grad_norm": 1.2890625, |
| "learning_rate": 3.2500000000000004e-05, |
| "loss": 0.3205, |
| "step": 470 |
| }, |
| { |
| "epoch": 3.5183098591549298, |
| "grad_norm": 0.95703125, |
| "learning_rate": 3.246268656716418e-05, |
| "loss": 0.2793, |
| "step": 471 |
| }, |
| { |
| "epoch": 3.5258215962441315, |
| "grad_norm": 1.390625, |
| "learning_rate": 3.2425373134328355e-05, |
| "loss": 0.2795, |
| "step": 472 |
| }, |
| { |
| "epoch": 3.533333333333333, |
| "grad_norm": 0.859375, |
| "learning_rate": 3.238805970149254e-05, |
| "loss": 0.2497, |
| "step": 473 |
| }, |
| { |
| "epoch": 3.5408450704225354, |
| "grad_norm": 1.125, |
| "learning_rate": 3.235074626865672e-05, |
| "loss": 0.2743, |
| "step": 474 |
| }, |
| { |
| "epoch": 3.548356807511737, |
| "grad_norm": 0.94140625, |
| "learning_rate": 3.2313432835820896e-05, |
| "loss": 0.2061, |
| "step": 475 |
| }, |
| { |
| "epoch": 3.555868544600939, |
| "grad_norm": 1.3203125, |
| "learning_rate": 3.227611940298508e-05, |
| "loss": 0.3852, |
| "step": 476 |
| }, |
| { |
| "epoch": 3.563380281690141, |
| "grad_norm": 1.1015625, |
| "learning_rate": 3.2238805970149255e-05, |
| "loss": 0.2777, |
| "step": 477 |
| }, |
| { |
| "epoch": 3.5708920187793427, |
| "grad_norm": 1.140625, |
| "learning_rate": 3.220149253731343e-05, |
| "loss": 0.2988, |
| "step": 478 |
| }, |
| { |
| "epoch": 3.5784037558685444, |
| "grad_norm": 1.1015625, |
| "learning_rate": 3.216417910447761e-05, |
| "loss": 0.2646, |
| "step": 479 |
| }, |
| { |
| "epoch": 3.5859154929577466, |
| "grad_norm": 1.125, |
| "learning_rate": 3.2126865671641796e-05, |
| "loss": 0.2648, |
| "step": 480 |
| }, |
| { |
| "epoch": 3.5934272300469483, |
| "grad_norm": 1.2421875, |
| "learning_rate": 3.208955223880597e-05, |
| "loss": 0.3574, |
| "step": 481 |
| }, |
| { |
| "epoch": 3.60093896713615, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.2052238805970154e-05, |
| "loss": 0.2751, |
| "step": 482 |
| }, |
| { |
| "epoch": 3.608450704225352, |
| "grad_norm": 1.2265625, |
| "learning_rate": 3.201492537313433e-05, |
| "loss": 0.2955, |
| "step": 483 |
| }, |
| { |
| "epoch": 3.615962441314554, |
| "grad_norm": 0.99609375, |
| "learning_rate": 3.1977611940298505e-05, |
| "loss": 0.2677, |
| "step": 484 |
| }, |
| { |
| "epoch": 3.6234741784037556, |
| "grad_norm": 1.2421875, |
| "learning_rate": 3.194029850746269e-05, |
| "loss": 0.3071, |
| "step": 485 |
| }, |
| { |
| "epoch": 3.630985915492958, |
| "grad_norm": 1.25, |
| "learning_rate": 3.190298507462687e-05, |
| "loss": 0.3171, |
| "step": 486 |
| }, |
| { |
| "epoch": 3.6384976525821595, |
| "grad_norm": 1.296875, |
| "learning_rate": 3.1865671641791046e-05, |
| "loss": 0.2817, |
| "step": 487 |
| }, |
| { |
| "epoch": 3.6460093896713612, |
| "grad_norm": 6.4375, |
| "learning_rate": 3.182835820895523e-05, |
| "loss": 0.4779, |
| "step": 488 |
| }, |
| { |
| "epoch": 3.6535211267605634, |
| "grad_norm": 1.0, |
| "learning_rate": 3.1791044776119405e-05, |
| "loss": 0.2539, |
| "step": 489 |
| }, |
| { |
| "epoch": 3.661032863849765, |
| "grad_norm": 1.0859375, |
| "learning_rate": 3.175373134328358e-05, |
| "loss": 0.2517, |
| "step": 490 |
| }, |
| { |
| "epoch": 3.6685446009389673, |
| "grad_norm": 1.1875, |
| "learning_rate": 3.171641791044776e-05, |
| "loss": 0.2995, |
| "step": 491 |
| }, |
| { |
| "epoch": 3.676056338028169, |
| "grad_norm": 1.40625, |
| "learning_rate": 3.1679104477611945e-05, |
| "loss": 0.3996, |
| "step": 492 |
| }, |
| { |
| "epoch": 3.683568075117371, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.164179104477612e-05, |
| "loss": 0.367, |
| "step": 493 |
| }, |
| { |
| "epoch": 3.691079812206573, |
| "grad_norm": 1.2265625, |
| "learning_rate": 3.16044776119403e-05, |
| "loss": 0.216, |
| "step": 494 |
| }, |
| { |
| "epoch": 3.6985915492957746, |
| "grad_norm": 1.234375, |
| "learning_rate": 3.156716417910448e-05, |
| "loss": 0.3269, |
| "step": 495 |
| }, |
| { |
| "epoch": 3.7061032863849768, |
| "grad_norm": 1.421875, |
| "learning_rate": 3.1529850746268655e-05, |
| "loss": 0.3337, |
| "step": 496 |
| }, |
| { |
| "epoch": 3.7136150234741785, |
| "grad_norm": 1.265625, |
| "learning_rate": 3.149253731343284e-05, |
| "loss": 0.3343, |
| "step": 497 |
| }, |
| { |
| "epoch": 3.72112676056338, |
| "grad_norm": 1.796875, |
| "learning_rate": 3.145522388059702e-05, |
| "loss": 0.3311, |
| "step": 498 |
| }, |
| { |
| "epoch": 3.7286384976525824, |
| "grad_norm": 1.1015625, |
| "learning_rate": 3.1417910447761196e-05, |
| "loss": 0.3231, |
| "step": 499 |
| }, |
| { |
| "epoch": 3.736150234741784, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.138059701492537e-05, |
| "loss": 0.226, |
| "step": 500 |
| }, |
| { |
| "epoch": 3.743661971830986, |
| "grad_norm": 1.09375, |
| "learning_rate": 3.1343283582089554e-05, |
| "loss": 0.2367, |
| "step": 501 |
| }, |
| { |
| "epoch": 3.751173708920188, |
| "grad_norm": 1.28125, |
| "learning_rate": 3.130597014925373e-05, |
| "loss": 0.3148, |
| "step": 502 |
| }, |
| { |
| "epoch": 3.7586854460093897, |
| "grad_norm": 1.0390625, |
| "learning_rate": 3.126865671641791e-05, |
| "loss": 0.4082, |
| "step": 503 |
| }, |
| { |
| "epoch": 3.7661971830985914, |
| "grad_norm": 1.1484375, |
| "learning_rate": 3.1231343283582095e-05, |
| "loss": 0.3441, |
| "step": 504 |
| }, |
| { |
| "epoch": 3.7737089201877936, |
| "grad_norm": 1.203125, |
| "learning_rate": 3.119402985074627e-05, |
| "loss": 0.3403, |
| "step": 505 |
| }, |
| { |
| "epoch": 3.7812206572769953, |
| "grad_norm": 1.109375, |
| "learning_rate": 3.115671641791045e-05, |
| "loss": 0.303, |
| "step": 506 |
| }, |
| { |
| "epoch": 3.788732394366197, |
| "grad_norm": 1.1015625, |
| "learning_rate": 3.111940298507463e-05, |
| "loss": 0.2613, |
| "step": 507 |
| }, |
| { |
| "epoch": 3.796244131455399, |
| "grad_norm": 1.1328125, |
| "learning_rate": 3.1082089552238805e-05, |
| "loss": 0.2781, |
| "step": 508 |
| }, |
| { |
| "epoch": 3.803755868544601, |
| "grad_norm": 1.0625, |
| "learning_rate": 3.104477611940299e-05, |
| "loss": 0.268, |
| "step": 509 |
| }, |
| { |
| "epoch": 3.8112676056338026, |
| "grad_norm": 1.0234375, |
| "learning_rate": 3.100746268656717e-05, |
| "loss": 0.2663, |
| "step": 510 |
| }, |
| { |
| "epoch": 3.818779342723005, |
| "grad_norm": 1.1484375, |
| "learning_rate": 3.0970149253731346e-05, |
| "loss": 0.3102, |
| "step": 511 |
| }, |
| { |
| "epoch": 3.8262910798122065, |
| "grad_norm": 1.1953125, |
| "learning_rate": 3.093283582089552e-05, |
| "loss": 0.4035, |
| "step": 512 |
| }, |
| { |
| "epoch": 3.8338028169014082, |
| "grad_norm": 1.1015625, |
| "learning_rate": 3.0895522388059704e-05, |
| "loss": 0.2987, |
| "step": 513 |
| }, |
| { |
| "epoch": 3.8413145539906104, |
| "grad_norm": 1.15625, |
| "learning_rate": 3.085820895522388e-05, |
| "loss": 0.2892, |
| "step": 514 |
| }, |
| { |
| "epoch": 3.848826291079812, |
| "grad_norm": 1.03125, |
| "learning_rate": 3.082089552238806e-05, |
| "loss": 0.4299, |
| "step": 515 |
| }, |
| { |
| "epoch": 3.856338028169014, |
| "grad_norm": 1.171875, |
| "learning_rate": 3.078358208955224e-05, |
| "loss": 0.3571, |
| "step": 516 |
| }, |
| { |
| "epoch": 3.863849765258216, |
| "grad_norm": 1.421875, |
| "learning_rate": 3.074626865671642e-05, |
| "loss": 0.4082, |
| "step": 517 |
| }, |
| { |
| "epoch": 3.8713615023474177, |
| "grad_norm": 1.109375, |
| "learning_rate": 3.07089552238806e-05, |
| "loss": 0.3317, |
| "step": 518 |
| }, |
| { |
| "epoch": 3.87887323943662, |
| "grad_norm": 1.375, |
| "learning_rate": 3.067164179104477e-05, |
| "loss": 0.3549, |
| "step": 519 |
| }, |
| { |
| "epoch": 3.8863849765258216, |
| "grad_norm": 1.1171875, |
| "learning_rate": 3.0634328358208955e-05, |
| "loss": 0.2538, |
| "step": 520 |
| }, |
| { |
| "epoch": 3.8938967136150233, |
| "grad_norm": 1.0703125, |
| "learning_rate": 3.059701492537314e-05, |
| "loss": 0.2683, |
| "step": 521 |
| }, |
| { |
| "epoch": 3.9014084507042255, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.055970149253731e-05, |
| "loss": 0.3572, |
| "step": 522 |
| }, |
| { |
| "epoch": 3.908920187793427, |
| "grad_norm": 1.125, |
| "learning_rate": 3.0522388059701496e-05, |
| "loss": 0.2967, |
| "step": 523 |
| }, |
| { |
| "epoch": 3.9164319248826294, |
| "grad_norm": 1.390625, |
| "learning_rate": 3.0485074626865672e-05, |
| "loss": 0.3288, |
| "step": 524 |
| }, |
| { |
| "epoch": 3.923943661971831, |
| "grad_norm": 1.1796875, |
| "learning_rate": 3.044776119402985e-05, |
| "loss": 0.2934, |
| "step": 525 |
| }, |
| { |
| "epoch": 3.931455399061033, |
| "grad_norm": 1.125, |
| "learning_rate": 3.0410447761194033e-05, |
| "loss": 0.3035, |
| "step": 526 |
| }, |
| { |
| "epoch": 3.938967136150235, |
| "grad_norm": 1.0390625, |
| "learning_rate": 3.037313432835821e-05, |
| "loss": 0.2851, |
| "step": 527 |
| }, |
| { |
| "epoch": 3.9464788732394367, |
| "grad_norm": 0.9765625, |
| "learning_rate": 3.033582089552239e-05, |
| "loss": 0.2385, |
| "step": 528 |
| }, |
| { |
| "epoch": 3.9539906103286384, |
| "grad_norm": 1.0859375, |
| "learning_rate": 3.029850746268657e-05, |
| "loss": 0.2407, |
| "step": 529 |
| }, |
| { |
| "epoch": 3.9615023474178406, |
| "grad_norm": 1.140625, |
| "learning_rate": 3.0261194029850747e-05, |
| "loss": 0.273, |
| "step": 530 |
| }, |
| { |
| "epoch": 3.9690140845070423, |
| "grad_norm": 1.140625, |
| "learning_rate": 3.0223880597014926e-05, |
| "loss": 0.4011, |
| "step": 531 |
| }, |
| { |
| "epoch": 3.976525821596244, |
| "grad_norm": 1.09375, |
| "learning_rate": 3.018656716417911e-05, |
| "loss": 0.2457, |
| "step": 532 |
| }, |
| { |
| "epoch": 3.984037558685446, |
| "grad_norm": 1.2421875, |
| "learning_rate": 3.0149253731343284e-05, |
| "loss": 0.3084, |
| "step": 533 |
| }, |
| { |
| "epoch": 3.991549295774648, |
| "grad_norm": 1.1328125, |
| "learning_rate": 3.0111940298507463e-05, |
| "loss": 0.2906, |
| "step": 534 |
| }, |
| { |
| "epoch": 3.9990610328638496, |
| "grad_norm": 1.1953125, |
| "learning_rate": 3.0074626865671646e-05, |
| "loss": 0.3793, |
| "step": 535 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 4.125, |
| "learning_rate": 3.003731343283582e-05, |
| "loss": 0.3955, |
| "step": 536 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 1340, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.7424015854128333e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|